From db921ae95ec264479a34a0026f9dba3cbc586733 Mon Sep 17 00:00:00 2001 From: sneaxiy <32832641+sneaxiy@users.noreply.github.com> Date: Sat, 22 Jul 2023 18:19:33 +0800 Subject: [PATCH 01/34] Fix launch error when PADDLE_TRAINER_ENDPOINTS is too long (#55478) * fix new launch * fix ps uit --- python/paddle/distributed/backup_env.py | 35 +++++++++++++++++++ .../distributed/fleet/base/role_maker.py | 8 +++-- .../distributed/fleet/elastic/manager.py | 4 ++- .../launch/controllers/controller.py | 3 ++ .../launch/utils/process_context.py | 26 +++++++++++++- python/paddle/distributed/parallel.py | 9 +++-- python/paddle/fluid/executor.py | 3 +- test/legacy_test/test_run.py | 1 + 8 files changed, 81 insertions(+), 8 deletions(-) create mode 100644 python/paddle/distributed/backup_env.py diff --git a/python/paddle/distributed/backup_env.py b/python/paddle/distributed/backup_env.py new file mode 100644 index 0000000000000..60428b9a2025d --- /dev/null +++ b/python/paddle/distributed/backup_env.py @@ -0,0 +1,35 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import json +import os + +g_backup_envs = None + + +def getenv_or_backup(name, default=None): + global g_backup_envs + if g_backup_envs is None: + backup_path = os.getenv('PADDLE_BACKUP_ENV_PATH') + if backup_path is None: + g_backup_envs = {} + else: + with open(backup_path, 'r') as f: + g_backup_envs = json.load(f) + + value = os.getenv(name) + if value is not None: + return value + else: + return g_backup_envs.get(name, default) diff --git a/python/paddle/distributed/fleet/base/role_maker.py b/python/paddle/distributed/fleet/base/role_maker.py index 3f2b22d8795c2..113a0132f4c12 100755 --- a/python/paddle/distributed/fleet/base/role_maker.py +++ b/python/paddle/distributed/fleet/base/role_maker.py @@ -25,6 +25,8 @@ ) from paddle.fluid import core +from ...backup_env import getenv_or_backup + __all__ = [] @@ -844,7 +846,9 @@ def _ps_env(self): # each role will execute it self._server_endpoints = self._server_endpoints.split(",") - self._worker_endpoints = os.getenv("PADDLE_TRAINER_ENDPOINTS", None) + self._worker_endpoints = getenv_or_backup( + "PADDLE_TRAINER_ENDPOINTS", None + ) if self._worker_endpoints is not None: self._worker_endpoints = self._worker_endpoints.split(",") else: @@ -1066,7 +1070,7 @@ def _collective_env(self): self._training_role = os.getenv("PADDLE_TRAINING_ROLE", "TRAINER") assert self._training_role == "TRAINER" self._role = Role.WORKER - self._worker_endpoints = os.getenv("PADDLE_TRAINER_ENDPOINTS") + self._worker_endpoints = getenv_or_backup("PADDLE_TRAINER_ENDPOINTS") self._cur_endpoint = os.getenv("PADDLE_CURRENT_ENDPOINT") if self._worker_endpoints is None: # back to non_distributed execution. 
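
The mechanism this patch introduces is small enough to sketch in isolation: the launcher withholds any oversized variable from the child process environment, dumps the complete environment to a JSON file, and exports that file's path as PADDLE_BACKUP_ENV_PATH; readers then call getenv_or_backup() instead of os.getenv(). The standalone sketch below condenses both halves. The endpoint list and file location are fabricated for the demo, and this version of the reader takes an explicit env mapping as an argument, whereas the real helper in backup_env.py consults os.environ.

    import json
    import os
    import tempfile

    # Launcher side (condensed from process_context.py): variables in
    # LIMIT_LEN_ENVS that exceed the length limit are dropped from the
    # child environment, but the full environment is backed up to JSON.
    LIMIT_LEN_ENVS = ["TRAINER_IP_PORT_LIST", "PADDLE_TRAINER_ENDPOINTS"]
    max_len = 48000  # default of PADDLE_ENV_LIMIT_LEN in the patch

    full_env = {
        # a deliberately oversized endpoint list, made up for the demo
        "PADDLE_TRAINER_ENDPOINTS": ",".join(
            f"10.0.0.{i % 250}:{6170 + i}" for i in range(8000)
        ),
        "PADDLE_TRAINER_ID": "0",
    }
    backup_path = os.path.join(tempfile.mkdtemp(), "backup_env.0.json")
    with open(backup_path, "w") as f:
        json.dump(full_env, f)

    child_env = {
        k: v
        for k, v in full_env.items()
        if k not in LIMIT_LEN_ENVS or len(v) < max_len
    }
    child_env["PADDLE_BACKUP_ENV_PATH"] = backup_path

    # Worker side (condensed from backup_env.py): fall back to the
    # backup file when the variable is absent from the environment.
    def getenv_or_backup(env, name, default=None):
        value = env.get(name)
        if value is not None:
            return value
        path = env.get("PADDLE_BACKUP_ENV_PATH")
        if path is None:
            return default
        with open(path) as f:
            return json.load(f).get(name, default)

    endpoints = getenv_or_backup(child_env, "PADDLE_TRAINER_ENDPOINTS")
    print(len(endpoints.split(",")))  # 8000, recovered via the backup file

The oversized variable never crosses the process environment table, which is what made the original launch fail, yet any consumer that goes through getenv_or_backup() still sees the full value.
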
diff --git a/python/paddle/distributed/fleet/elastic/manager.py b/python/paddle/distributed/fleet/elastic/manager.py index 5e0de5c3120e3..00151a8dee5f1 100644 --- a/python/paddle/distributed/fleet/elastic/manager.py +++ b/python/paddle/distributed/fleet/elastic/manager.py @@ -25,6 +25,8 @@ from paddle.distributed.fleet import cloud_utils, launch_utils from paddle.distributed.utils.log_utils import get_logger +from ...backup_env import getenv_or_backup + logger = get_logger("INFO", "ELASTIC") ELASTIC_EXIT_CODE = 101 @@ -149,7 +151,7 @@ def __init__(self, args, etcd_client): self.np = len(self.trainers.split(",")) self.start_port = int(os.getenv("PADDLE_PORT", "6170")) self.dist_endpoints = os.getenv('DISTRIBUTED_TRAINER_ENDPOINTS', '') - trainer_endpoints = os.getenv('PADDLE_TRAINER_ENDPOINTS', '') + trainer_endpoints = getenv_or_backup('PADDLE_TRAINER_ENDPOINTS', '') self.trainer_endpoints_list = trainer_endpoints.split(",") else: self.trainers = args.ips or os.getenv('PADDLE_TRAINERS', '') diff --git a/python/paddle/distributed/launch/controllers/controller.py b/python/paddle/distributed/launch/controllers/controller.py index 9769ec9d6bf3f..25becbba6f329 100644 --- a/python/paddle/distributed/launch/controllers/controller.py +++ b/python/paddle/distributed/launch/controllers/controller.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +import copy import os import signal import sys @@ -244,6 +245,8 @@ def add_container( is_init=False, ): if not container: + envs = copy.deepcopy(envs) + envs['PADDLE_LOG_DIR'] = str(os.path.abspath(self.ctx.args.log_dir)) container = self.new_container( entrypoint=entrypoint, envs=envs, out=log_file, err=log_file ) diff --git a/python/paddle/distributed/launch/utils/process_context.py b/python/paddle/distributed/launch/utils/process_context.py index 6543d7bd9ebae..8b14d5417a68b 100644 --- a/python/paddle/distributed/launch/utils/process_context.py +++ b/python/paddle/distributed/launch/utils/process_context.py @@ -12,12 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. +import json import os import signal import subprocess import sys import time +LIMIT_LEN_ENVS = ["TRAINER_IP_PORT_LIST", "PADDLE_TRAINER_ENDPOINTS"] + class ProcessContext: def __init__( @@ -42,9 +45,30 @@ def __init__( def _start(self): pre_fn = os.setsid if self._group else None + log_dir = self._env["PADDLE_LOG_DIR"] + os.makedirs(log_dir, exist_ok=True) + + rank = self._env.get("PADDLE_TRAINER_ID") + if rank is not None: + rank = int(rank) + backup_env_path = str( + os.path.join(log_dir, f'backup_env.{rank}.json') + ) + envs = {"PADDLE_BACKUP_ENV_PATH": backup_env_path} + + max_len = int(os.getenv('PADDLE_ENV_LIMIT_LEN', 48000)) + for k, v in self._env.items(): + if k not in LIMIT_LEN_ENVS or len(v) < max_len: + envs[k] = v + + with open(backup_env_path, 'w') as f: + json.dump(dict(self._env), f, indent=4, sort_keys=True) + else: + envs = self._env + self._proc = subprocess.Popen( self._cmd, - env=self._env, + env=envs, stdout=self._stdout, stderr=self._stderr, preexec_fn=self._preexec_fn or pre_fn, diff --git a/python/paddle/distributed/parallel.py b/python/paddle/distributed/parallel.py index cc6ab5384ca4e..a34807d2b7377 100644 --- a/python/paddle/distributed/parallel.py +++ b/python/paddle/distributed/parallel.py @@ -56,6 +56,7 @@ from paddle.utils import deprecated from . 
import parallel_helper +from .backup_env import getenv_or_backup __all__ = [] @@ -704,7 +705,7 @@ def __init__(self): selected_xpus = os.getenv("FLAGS_selected_xpus", "0").split(",") self._device_id = int(selected_xpus[0]) - self._trainer_endpoints = os.getenv( + self._trainer_endpoints = getenv_or_backup( "PADDLE_TRAINER_ENDPOINTS", "" ).split(",") self._current_endpoint = os.getenv("PADDLE_CURRENT_ENDPOINT", "") @@ -878,7 +879,7 @@ def _is_cpuonly(backend): def _check_var_exists(var_name): - var = os.environ.get(var_name, None) + var = getenv_or_backup(var_name, None) if var is None: raise ValueError( "paddle.distributed initialize error, " @@ -1060,7 +1061,9 @@ def train(): if endpoints is None: endpoints = os.getenv("PADDLE_MASTER", None) if endpoints is None: - endpoints = os.getenv("PADDLE_TRAINER_ENDPOINTS").split(',')[0] + endpoints = getenv_or_backup("PADDLE_TRAINER_ENDPOINTS").split(',')[ + 0 + ] assert endpoints, ( "The environment variable 'MASTER_ADDR' and 'MASTER_PORT' " "must be specified, for example 'export MASTER_ADDR=127.0.0.1' " diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py index b3deb787960e6..a9afe7f5c8d0d 100755 --- a/python/paddle/fluid/executor.py +++ b/python/paddle/fluid/executor.py @@ -525,8 +525,9 @@ def _to_str(var): def _prepare_fleet_executor(): from ..distributed.fleet.proto import fleet_executor_desc_pb2 + from ..distributed.backup_env import getenv_or_backup - trainer_endpoints_str = os.getenv("PADDLE_TRAINER_ENDPOINTS", "") + trainer_endpoints_str = getenv_or_backup("PADDLE_TRAINER_ENDPOINTS", "") trainer_endpoints = trainer_endpoints_str.split(',') fleet_exe_desc = fleet_executor_desc_pb2.FleetExecutorDesc() cur_rank = int(os.getenv("PADDLE_TRAINER_ID", 0)) diff --git a/test/legacy_test/test_run.py b/test/legacy_test/test_run.py index 467b9ef35c67b..3174dd7005ce6 100644 --- a/test/legacy_test/test_run.py +++ b/test/legacy_test/test_run.py @@ -55,6 +55,7 @@ def get_files(pth, prefix): if isfile(join(pth, f)) and not f.endswith('gpu.log') and not f.startswith('envlog') + and not f.startswith('backup_env') ] From 6da9db501aaabd166c6591c5a7608df746aba226 Mon Sep 17 00:00:00 2001 From: liuzhenhai93 Date: Sat, 22 Jul 2023 22:16:57 +0800 Subject: [PATCH 02/34] fix group_shard3_get_all_parameter (#55572) --- .../sharding/group_sharded_stage3.py | 45 +++++++++++-------- 1 file changed, 26 insertions(+), 19 deletions(-) diff --git a/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_stage3.py b/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_stage3.py index f6b86ce736d78..b9ca53aeef0a1 100644 --- a/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_stage3.py +++ b/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_stage3.py @@ -651,6 +651,11 @@ def get_all_parameters(self, convert2cpu=False): for param in trainable_params: t_flow.full_param[param.name][0]._share_buffer_to(param) + # a _allgather_buffer call should be matched with a _release_param call later, + # but the _allgather_buffer call here has no match. 
+ # TODO(liuzhenhai): set a flag here and release full param before forward pass of the first layer, + # when _allgather_buffer is called for get_all_parameters and convert2cpu is false + self._optim._parameter_list = self._ori_parameter_list self._optim._param_groups = self._ori_param_groups @@ -924,14 +929,11 @@ class TaskFlow: def __init__( self, - full_param={}, - full_grad={}, - use_calc={}, callback=None, ): - self.full_param = full_param - self.full_grad = full_grad - self.use_calc = use_calc + self.full_param = {} + self.full_grad = {} + self.use_calc = {} self.callback = callback @@ -1004,6 +1006,9 @@ def _allgather_buffer( offload=False, convert2cpu=False, ): + if convert2cpu: + assert sync_wait + for param in trainable_params: if param.status == "all": param.use_count += 1 @@ -1020,20 +1025,22 @@ def _allgather_buffer( if sync_wait: with paddle.amp.auto_cast(enable=False): task.wait() - full_param._slice(0, param._numel())._share_buffer_to(param) - param.fw_storage._clear() - param.fw_storage = None - param.status = "all" - param.use_count += 1 + if convert2cpu: + # status is not changed + cpu_full_param = _device2cpu( + full_param._slice(0, param._numel()) + ) + full_param._clear_data() + del full_param + full_param = cpu_full_param + task = None + else: + full_param._slice(0, param._numel())._share_buffer_to(param) + param.fw_storage._clear() + param.fw_storage = None + param.status = "all" + param.use_count += 1 task_flow.full_param[param.name] = (full_param, task) - - # parameter converts to cpu - if convert2cpu: - p_name = param.name - param = _device2cpu(param) - del task_flow.full_param[p_name] - task_flow.full_param[p_name] = (param, None) - return task_flow From 2b8e62859b8f83d646c6a53c3572f88440cdc0bc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E6=98=A5=E4=B9=94?= <83450930+Liyulingyue@users.noreply.github.com> Date: Mon, 24 Jul 2023 08:26:53 +0800 Subject: [PATCH 03/34] =?UTF-8?q?[IR=20Dialect]=20=E2=9A=94Elden=20chapter?= =?UTF-8?q?=201.1=E2=9A=94=20=20(#55525)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * IntArrayAttributeStorage --- paddle/fluid/ir/dialect/pd_attribute.cc | 4 +++- paddle/fluid/ir/dialect/pd_attribute.h | 2 +- paddle/fluid/ir/dialect/pd_attribute_storage.h | 2 +- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/ir/dialect/pd_attribute.cc b/paddle/fluid/ir/dialect/pd_attribute.cc index 78a71013b87b5..687e836dc70f3 100644 --- a/paddle/fluid/ir/dialect/pd_attribute.cc +++ b/paddle/fluid/ir/dialect/pd_attribute.cc @@ -16,7 +16,9 @@ namespace paddle { namespace dialect { -phi::IntArray IntArrayAttribute::data() const { return storage()->GetAsKey(); } +const phi::IntArray& IntArrayAttribute::data() const { + return storage()->GetAsKey(); +} phi::DataType DataTypeAttribute::data() const { return storage()->GetAsKey(); } diff --git a/paddle/fluid/ir/dialect/pd_attribute.h b/paddle/fluid/ir/dialect/pd_attribute.h index 7d921ebd1df1b..5af73b2c0f48b 100644 --- a/paddle/fluid/ir/dialect/pd_attribute.h +++ b/paddle/fluid/ir/dialect/pd_attribute.h @@ -33,7 +33,7 @@ class IntArrayAttribute : public ir::Attribute { return storage() < right.storage(); } - phi::IntArray data() const; + const phi::IntArray &data() const; }; class ScalarAttribute : public ir::Attribute { diff --git a/paddle/fluid/ir/dialect/pd_attribute_storage.h b/paddle/fluid/ir/dialect/pd_attribute_storage.h index 78bf9beb089d7..1877e5043fc65 100644 --- a/paddle/fluid/ir/dialect/pd_attribute_storage.h +++ 
b/paddle/fluid/ir/dialect/pd_attribute_storage.h @@ -48,7 +48,7 @@ struct IntArrayAttributeStorage : public ir::AttributeStorage { (data_.FromTensor() == key.FromTensor()); } - ParamKey GetAsKey() const { return ParamKey(data_); } + const ParamKey &GetAsKey() const { return data_; } private: phi::IntArray data_; From bd60757d03f062583b95c966e84036985c830f3b Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Mon, 24 Jul 2023 10:35:46 +0800 Subject: [PATCH 04/34] [AutoParallel] Add shard tensor and DistAttr api (#55494) * add shard tensor api * add DistAttr api * add unittest for coverage * fix process mesh sample code * fix checking error --- python/paddle/distributed/__init__.py | 8 +- .../paddle/distributed/auto_parallel/api.py | 126 ++++++++++++++++++ .../distributed/auto_parallel/process_mesh.py | 10 +- test/auto_parallel/CMakeLists.txt | 1 + test/auto_parallel/test_dist_tensor.py | 8 +- test/auto_parallel/test_shard_tensor_api.py | 78 +++++++++++ 6 files changed, 226 insertions(+), 5 deletions(-) create mode 100644 python/paddle/distributed/auto_parallel/api.py create mode 100644 test/auto_parallel/test_shard_tensor_api.py diff --git a/python/paddle/distributed/__init__.py b/python/paddle/distributed/__init__.py index 8f6237bfa4c4b..183f307607c36 100644 --- a/python/paddle/distributed/__init__.py +++ b/python/paddle/distributed/__init__.py @@ -61,8 +61,11 @@ get_backend, ) # noqa: F401 +from .auto_parallel.process_mesh import ProcessMesh # noqa: F401 +from .auto_parallel.api import DistAttr # noqa: F401 + from .auto_parallel import shard_op # noqa: F401 -from .auto_parallel import shard_tensor # noqa: F401 +from .auto_parallel.api import shard_tensor # noqa: F401 from .fleet import BoxPSDataset # noqa: F401 @@ -120,4 +123,7 @@ "reduce_scatter", "is_available", "get_backend", + "ProcessMesh", + "DistAttr", + "shard_tensor", ] diff --git a/python/paddle/distributed/auto_parallel/api.py b/python/paddle/distributed/auto_parallel/api.py new file mode 100644 index 0000000000000..b25799d058ad2 --- /dev/null +++ b/python/paddle/distributed/auto_parallel/api.py @@ -0,0 +1,126 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +from paddle.distributed.auto_parallel.process_mesh import ProcessMesh +from paddle.framework import core + +# There are the auto parallel API of the unified version of dynamic and static mode. +# Some APIs have the same name with the previous APIs implementation, which are +# a temporary state, and the APIs here will eventually be used. + + +class DistAttr(core.TensorDistAttr): + """ + DistAttr specifies how tensors are distributed or sliced on ProcessMesh. + + Args: + mesh(paddle.distributed.ProcessMesh): The `ProcessMesh` object describes the Cartesian topology of the used processes. + sharding_specs(list[str|None]): The specification describing how to shard the Tensor. + + Examples: + + .. 
code-block:: python + + import paddle + import paddle.distributed as dist + + mesh = dist.ProcessMesh([[2, 4, 5], [0, 1, 3]], dim_names=["x", "y"]) + dist_attr = dist.DistAttr(mesh=mesh, sharding_specs=['x', 'y']) + + print(dist_attr) + """ + + def __init__(self, mesh, sharding_specs): + # 1. inputs checking + if not isinstance(mesh, ProcessMesh): + raise ValueError( + "The mesh must be an instance of paddle.distributed.ProcessMesh." + ) + if not isinstance(sharding_specs, list): + raise ValueError("The sharding_specs must be an instance of list.") + assert all( + isinstance(dim_name, str) or dim_name is None + for dim_name in sharding_specs + ), 'The dimension name in sharding_specs must be an instance of str.' + + dims_mapping = [ + mesh.dim_names.index(dim_name) if dim_name is not None else -1 + for dim_name in sharding_specs + ] + + # 2. init core.TensorDistAttr + core.TensorDistAttr.__init__(self) + self.process_mesh = mesh + self.dims_mapping = dims_mapping + + +def shard_tensor( + data, dtype=None, place=None, stop_gradient=True, dist_attr=None +): + """ + Constructs a ``paddle.Tensor`` with distributed attributes from ``data``, + which can scalar, tuple, list, numpy.ndarray, paddle.Tensor. + + If the ``data`` is already a Tensor, transform it to a Distributed Tensor. + + Args: + data(scalar|tuple|list|ndarray|Tensor): Initial data for the tensor. + Can be a scalar, list, tuple, numpy.ndarray, paddle.Tensor. + dtype(str|np.dtype, optional): The desired data type of returned tensor. Can be 'bool' , 'float16' , + 'float32' , 'float64' , 'int8' , 'int16' , 'int32' , 'int64' , 'uint8', + 'complex64' , 'complex128'. Default: None, infers dtype from ``data`` + except for python float number which gets dtype from ``get_default_type`` . + place(CPUPlace|CUDAPinnedPlace|CUDAPlace|str, optional): The place to allocate Tensor. Can be + CPUPlace, CUDAPinnedPlace, CUDAPlace. Default: None, means global place. If ``place`` is + string, It can be ``cpu``, ``gpu:x`` and ``gpu_pinned``, where ``x`` is the index of the GPUs. + stop_gradient(bool, optional): Whether to block the gradient propagation of Autograd. Default: True. + dist_attr(paddle.distributed.DistAttr): Specify how tensors are distributed or sliced on ProcessMesh. + + Returns: + Tensor: A Tensor constructed from ``data`` with distributed attributes. + + Examples: + + .. code-block:: python + + import paddle + import paddle.distributed as dist + + mesh = dist.ProcessMesh([[2, 4, 5], [0, 1, 3]], dim_names=["x", "y"]) + dist_attr = dist.DistAttr(mesh=mesh, sharding_specs=['x', 'y']) + + # dense tensor + a = paddle.to_tensor([[1,2,3], + [5,6,7]]) + # distributed tensor + d_tensor = dist.shard_tensor(a, dist_attr=dist_attr) + + print(d_tensor) + """ + # 1. create dense tensor + # `paddle.to_tensor` supports both dynamic and static mode + data = paddle.to_tensor(data) + + # 2. create dist tensor + assert len(dist_attr.dims_mapping) == len( + list(data.shape) + ), "The length of sharding_specs must be same as the shape of the input tensor." + + if paddle.in_dynamic_mode(): + return paddle.Tensor(data, dist_attr=dist_attr) + else: + raise NotImplementedError( + "The `paddle.distributed.shard_tensor` for static mode will be implemented later." 
+ ) diff --git a/python/paddle/distributed/auto_parallel/process_mesh.py b/python/paddle/distributed/auto_parallel/process_mesh.py index 1c2f292e5f861..a6ad3355d7d2c 100644 --- a/python/paddle/distributed/auto_parallel/process_mesh.py +++ b/python/paddle/distributed/auto_parallel/process_mesh.py @@ -82,8 +82,9 @@ class ProcessMesh(core.ProcessMesh): .. code-block:: python import paddle + import paddle.distributed as dist - mesh = auto.ProcessMesh([[2, 4, 5], [0, 1, 3]], dim_names=["x", "y"]) + mesh = dist.ProcessMesh([[2, 4, 5], [0, 1, 3]], dim_names=["x", "y"]) assert mesh.shape == [2, 3] assert mesh.process_ids == [2, 4, 5, 0, 1, 3] @@ -162,6 +163,13 @@ def mesh(self): """ return self._mesh + @property + def dim_names(self): + """ + Get the underlying dimension names of ProcessMesh. + """ + return self._dim_names + @property def unique_id(self): """ diff --git a/test/auto_parallel/CMakeLists.txt b/test/auto_parallel/CMakeLists.txt index fe67e34c03d39..83c2ae84182fb 100644 --- a/test/auto_parallel/CMakeLists.txt +++ b/test/auto_parallel/CMakeLists.txt @@ -153,6 +153,7 @@ if(WITH_DISTRIBUTE AND WITH_GPU) py_test_modules(test_engine_save_load MODULES test_engine_save_load) py_test_modules(test_rule_based_tuner MODULES test_rule_based_tuner) py_test_modules(test_dist_tensor MODULES test_dist_tensor) + py_test_modules(test_shard_tensor_api MODULES test_shard_tensor_api) # End of unittests WITH single card WITHOUT timeout endif() diff --git a/test/auto_parallel/test_dist_tensor.py b/test/auto_parallel/test_dist_tensor.py index 58ebc0850046b..61705a322e2d6 100644 --- a/test/auto_parallel/test_dist_tensor.py +++ b/test/auto_parallel/test_dist_tensor.py @@ -17,20 +17,22 @@ import numpy as np import paddle +import paddle.distributed as dist class TestDistTensor(unittest.TestCase): def test_dist_tensor_creation(self): shape = [10, 5] - dist_attr = paddle.fluid.core.TensorDistAttr() + mesh = dist.ProcessMesh([[0, 1], [2, 3]], dim_names=["x", "y"]) + dist_attr = dist.DistAttr(mesh=mesh, sharding_specs=['x', 'y']) # create dist tensor using numpy - dist_tensor_with_numpy = paddle.Tensor( + dist_tensor_with_numpy = dist.shard_tensor( np.ones(shape, dtype=np.float32), dist_attr=dist_attr ) # create dist tensor using tensor - dist_tensor_with_tensor = paddle.Tensor( + dist_tensor_with_tensor = dist.shard_tensor( paddle.ones(shape), dist_attr=dist_attr ) diff --git a/test/auto_parallel/test_shard_tensor_api.py b/test/auto_parallel/test_shard_tensor_api.py new file mode 100644 index 0000000000000..124c7dc7ba39e --- /dev/null +++ b/test/auto_parallel/test_shard_tensor_api.py @@ -0,0 +1,78 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest + +import paddle +import paddle.distributed as dist + + +class TestDistAttrBasic(unittest.TestCase): + def test_mesh_argument_error(self): + exception = None + try: + mesh = [[0, 1], [2, 3]] + dist_attr = dist.DistAttr(mesh=mesh, sharding_specs=['x', 'y']) + except ValueError as ex: + self.assertIn( + "The mesh must be an instance of paddle.distributed.ProcessMesh", + str(ex), + ) + exception = ex + + self.assertIsNotNone(exception) + + def test_sharding_specs_argument_error(self): + exception = None + try: + mesh = dist.ProcessMesh( + [[2, 4, 5], [0, 1, 3]], dim_names=["x", "y"] + ) + dist_attr = dist.DistAttr( + mesh=mesh, sharding_specs={"x": 0, "y": 1} + ) + except ValueError as ex: + self.assertIn( + "The sharding_specs must be an instance of list", str(ex) + ) + exception = ex + + self.assertIsNotNone(exception) + + +class TestShardTensorBasic(unittest.TestCase): + # remove this test after static mode is supported + def test_static_mode_unimplemented(self): + exception = None + try: + paddle.enable_static() + mesh = dist.ProcessMesh( + [[2, 4, 5], [0, 1, 3]], dim_names=["x", "y"] + ) + dist_attr = dist.DistAttr(mesh=mesh, sharding_specs=['x', 'y']) + a = paddle.to_tensor([[1, 2, 3], [5, 6, 7]]) + d_tensor = dist.shard_tensor(a, dist_attr=dist_attr) + except NotImplementedError as ex: + self.assertIn( + "The `paddle.distributed.shard_tensor` for static mode will be implemented later", + str(ex), + ) + exception = ex + paddle.disable_static() + + self.assertIsNotNone(exception) + + +if __name__ == "__main__": + unittest.main() From 1f3e6ec49ed4eacef4cf397491a9dc4a32f7a0ae Mon Sep 17 00:00:00 2001 From: xingmingyyj <135400902+xingmingyyj@users.noreply.github.com> Date: Mon, 24 Jul 2023 10:37:14 +0800 Subject: [PATCH 05/34] Order print attribute map (#55518) * fix_ir_printer * Update ir_printer.cc * Update ir_printer.cc * Update ir_printer.cc * Update ir_printer.cc * Update ir_printer.cc * Update paddle/ir/core/ir_printer.cc Co-authored-by: kangguangli * Update ir_printer.cc --------- Co-authored-by: kangguangli --- paddle/ir/core/ir_printer.cc | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/paddle/ir/core/ir_printer.cc b/paddle/ir/core/ir_printer.cc index a322e8fca9ffd..8890364565518 100644 --- a/paddle/ir/core/ir_printer.cc +++ b/paddle/ir/core/ir_printer.cc @@ -215,11 +215,14 @@ void IrPrinter::PrintOpResult(const Operation* op) { } void IrPrinter::PrintAttributeMap(const Operation* op) { + AttributeMap attributes = op->attributes(); + std::map> order_attributes( + attributes.begin(), attributes.end()); os << " {"; PrintInterleave( - op->attributes().begin(), - op->attributes().end(), + order_attributes.begin(), + order_attributes.end(), [this](std::pair it) { this->os << it.first; this->os << ":"; From cf76e7ae7cf6d26fc340ff0d3677870182688cd1 Mon Sep 17 00:00:00 2001 From: Yichen Zhang <32740647+pkuzyc@users.noreply.github.com> Date: Mon, 24 Jul 2023 10:46:37 +0800 Subject: [PATCH 06/34] [Semi-Auto] add split spmd rule (#55397) * add split spmd rule * add pytest in cmake file * small fix --- .../auto_parallel/spmd_rules/common.cc | 4 +- .../auto_parallel/spmd_rules/rules.h | 5 + .../spmd_rules/split_spmd_rule.cc | 126 +++++++++++ .../spmd_rules/split_spmd_rule.h | 40 ++++ test/auto_parallel/spmd_rules/CMakeLists.txt | 1 + .../spmd_rules/test_split_rule.py | 205 ++++++++++++++++++ 6 files changed, 379 insertions(+), 2 deletions(-) create mode 100644 paddle/fluid/distributed/auto_parallel/spmd_rules/split_spmd_rule.cc create mode 100644 
paddle/fluid/distributed/auto_parallel/spmd_rules/split_spmd_rule.h create mode 100644 test/auto_parallel/spmd_rules/test_split_rule.py diff --git a/paddle/fluid/distributed/auto_parallel/spmd_rules/common.cc b/paddle/fluid/distributed/auto_parallel/spmd_rules/common.cc index 47c0d9a683fc5..a0f46e1c46299 100644 --- a/paddle/fluid/distributed/auto_parallel/spmd_rules/common.cc +++ b/paddle/fluid/distributed/auto_parallel/spmd_rules/common.cc @@ -182,8 +182,8 @@ TensorDistAttr ReplicatedOnMesh(const TensorDistAttr& src_dist_attr) { void VerifySpecs(const std::vector& specs, const std::string& op_name) { for (size_t i = 0, n = specs.size(); i < n; ++i) { - std::vector shape = specs[i].shape(); - std::vector dims_mapping = specs[i].dims_mapping(); + const std::vector& shape = specs[i].shape(); + const std::vector& dims_mapping = specs[i].dims_mapping(); PADDLE_ENFORCE_EQ(shape.size(), dims_mapping.size(), phi::errors::InvalidArgument( diff --git a/paddle/fluid/distributed/auto_parallel/spmd_rules/rules.h b/paddle/fluid/distributed/auto_parallel/spmd_rules/rules.h index c58333d0fb701..bba4339198021 100644 --- a/paddle/fluid/distributed/auto_parallel/spmd_rules/rules.h +++ b/paddle/fluid/distributed/auto_parallel/spmd_rules/rules.h @@ -23,6 +23,7 @@ #include "paddle/fluid/distributed/auto_parallel/spmd_rules/reduction_spmd_rule.h" #include "paddle/fluid/distributed/auto_parallel/spmd_rules/replicated_spmd_rule.h" #include "paddle/fluid/distributed/auto_parallel/spmd_rules/softmax_spmd_rule.h" +#include "paddle/fluid/distributed/auto_parallel/spmd_rules/split_spmd_rule.h" // TODO(ljz) Automatic this process in cmake file. namespace paddle { @@ -150,6 +151,10 @@ REGISTER_SPMD_RULE(log_softmax, SoftmaxSPMDRule); REGISTER_SPMD_RULE(cross_entropy_with_softmax, CrossEntropyWithSoftmaxSPMDRule); REGISTER_SPMD_RULE(softmax_with_cross_entropy, CrossEntropyWithSoftmaxSPMDRule); +// split rule +REGISTER_SPMD_RULE(split, SplitSPMDRule); +REGISTER_SPMD_RULE(split_with_num, SplitSPMDRule); + } // namespace auto_parallel } // namespace distributed } // namespace paddle diff --git a/paddle/fluid/distributed/auto_parallel/spmd_rules/split_spmd_rule.cc b/paddle/fluid/distributed/auto_parallel/spmd_rules/split_spmd_rule.cc new file mode 100644 index 0000000000000..59c962dab8982 --- /dev/null +++ b/paddle/fluid/distributed/auto_parallel/spmd_rules/split_spmd_rule.cc @@ -0,0 +1,126 @@ +/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/distributed/auto_parallel/spmd_rules/split_spmd_rule.h" +#include +#include +#include "paddle/phi/core/distributed/auto_parallel/utils.h" + +namespace paddle { +namespace distributed { +namespace auto_parallel { + +using phi::distributed::auto_parallel::str_join; + +std::pair, std::vector> +SplitSPMDRule::InferForward(const std::vector& input_specs, + const paddle::framework::AttributeMap& attrs) { + // step0: Verify Input Args Based on Elementwise Logic + int64_t ninputs = input_specs.size(); + PADDLE_ENFORCE_EQ( + ninputs, + 1, + phi::errors::InvalidArgument("The size of InputSpec in split must " + "be equal to 1, but got [%d].", + ninputs)); + VerifySpecs(input_specs, "split"); + + // step1: Build Einsum Notation + int64_t ndim = input_specs[0].shape().size(); + int64_t noutput = 0; + // split api uses num or sections as attribute + if (attrs.find("num") != attrs.end()) { + noutput = ExtractAttr("num", attrs); + } else if (attrs.find("sections") != attrs.end()) { + std::vector sections = + ExtractAttr>("sections", attrs); + noutput = sections.size(); + } + int64_t axis = ExtractAttr("axis", attrs); + if (axis < 0) { + axis += ndim; + } + std::string alphabet = "abcdefghijlmnopqrstuvwxyz"; + + // get einsum notation for input, use a special + // notation 'k' to mark the splitted axis in input + std::vector input_axes_vec; + std::string input_axes = alphabet.substr(0, ndim); + input_axes[axis] = 'k'; + input_axes_vec.emplace_back(input_axes); + + // get einsum notation for output + std::string output_axes(input_axes); + // the splitted axis cannot be sharded, set its notation + // with the special '1' to set its dim mapping to -1. + output_axes[axis] = '1'; + + // step2: Sharding Propogation + // step2.1: merge input shardings + std::vector>> axes_sharding_info; + axes_sharding_info = GetAxesDimsMappingPair(input_axes_vec, input_specs); + std::unordered_map axis_to_dim_map = + ShardingMergeForTensors(axes_sharding_info); + + // step2.2: infer output dimsmapping from merged input dimsmapping + std::vector output_dims_mapping = + GetDimsMappingForAxes(output_axes, axis_to_dim_map); + + // get the dist attributes for all outputs, the + // dist attributes are same for all outputs. + std::vector output_dist_attrs; + for (int64_t i = 0; i < noutput; i++) { + output_dist_attrs.emplace_back( + CopyTensorDistAttrForOutput(input_specs[0].dist_attr())); + output_dist_attrs[i].set_dims_mapping(output_dims_mapping); + } + + // step2.3 get new dist attribute for input. the splitted + // cannot be sharded, if it is sharded, set it to replicated. 
+ std::vector new_input_dist_attrs; + new_input_dist_attrs.emplace_back(input_specs[0].dist_attr()); + std::vector new_input_dims_mapping(input_specs[0].dims_mapping()); + new_input_dims_mapping[axis] = -1; + new_input_dist_attrs[0].set_dims_mapping(new_input_dims_mapping); + + // Step2.4 handle input tensor partial (TODO) + VLOG(4) << "SplitSPMDRule InferForward: "; + for (int64_t i = 0; i < ninputs; i++) { + VLOG(4) << "Input" << std::to_string(i) << " shape: [" + << str_join(input_specs[i].shape()) << "] " + << "einsum_notation: " << input_axes << " src_dims_mapping: [" + << str_join(input_specs[i].dims_mapping()) << "] " + << "dst_dims_mapping: [" + << str_join(new_input_dist_attrs[i].dims_mapping()) << "]"; + } + for (int64_t i = 0; i < noutput; i++) { + VLOG(4) << "Output" << std::to_string(i) << " dims_mapping: [" + << str_join(output_dims_mapping) << "]"; + } + + return {new_input_dist_attrs, output_dist_attrs}; +} + +std::pair, std::vector> +SplitSPMDRule::InferBackward(const std::vector& output_specs, + const paddle::framework::AttributeMap& attrs) { + PADDLE_THROW(phi::errors::Unimplemented( + "InferBackward of SplitPMDRule is NOT implemented yet.")); + + return {}; +} + +} // namespace auto_parallel +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/auto_parallel/spmd_rules/split_spmd_rule.h b/paddle/fluid/distributed/auto_parallel/spmd_rules/split_spmd_rule.h new file mode 100644 index 0000000000000..f974e4cccce05 --- /dev/null +++ b/paddle/fluid/distributed/auto_parallel/spmd_rules/split_spmd_rule.h @@ -0,0 +1,40 @@ +/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include +#include +#include +#include + +#include "paddle/fluid/distributed/auto_parallel/spmd_rules/common.h" + +namespace paddle { +namespace distributed { +namespace auto_parallel { + +class SplitSPMDRule : public SPMDRuleBase { + public: + std::pair, std::vector> + InferForward(const std::vector& input_specs, + const paddle::framework::AttributeMap& attrs) override; + + std::pair, std::vector> + InferBackward(const std::vector& output_specs, + const paddle::framework::AttributeMap& attrs) override; +}; +} // namespace auto_parallel +} // namespace distributed +} // namespace paddle diff --git a/test/auto_parallel/spmd_rules/CMakeLists.txt b/test/auto_parallel/spmd_rules/CMakeLists.txt index ed1cf37389eee..1da9d4674c381 100644 --- a/test/auto_parallel/spmd_rules/CMakeLists.txt +++ b/test/auto_parallel/spmd_rules/CMakeLists.txt @@ -8,6 +8,7 @@ if(WITH_DISTRIBUTE AND WITH_GPU) py_test_modules(test_matmul_rule MODULES test_embedding_rule) py_test_modules(test_matmul_rule MODULES test_replicated_rule) py_test_modules(test_matmul_rule MODULES test_softmax_rule) + py_test_modules(test_split_rule MODULES test_split_rule) # End of unittests WITH single card WITHOUT timeout endif() diff --git a/test/auto_parallel/spmd_rules/test_split_rule.py b/test/auto_parallel/spmd_rules/test_split_rule.py new file mode 100644 index 0000000000000..1cd32d1bcf2b3 --- /dev/null +++ b/test/auto_parallel/spmd_rules/test_split_rule.py @@ -0,0 +1,205 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +from paddle.distributed.auto_parallel.static.completion import get_spmd_rule +from paddle.distributed.auto_parallel.static.dist_attribute import ( + DistTensorSpec, + TensorDistAttr, +) +from paddle.distributed.fleet import auto + + +class TestReductionSPMDRule(unittest.TestCase): + """ + Unit tests for split spmd rule. 
+ """ + + def setUp(self): + self.rule = get_spmd_rule("split") + + x_shape = [64, 32, 48] + process_mesh = auto.ProcessMesh(mesh=[0, 1, 2, 3]) + + x_tensor_dist_attr = TensorDistAttr() + x_tensor_dist_attr.dims_mapping = [1, 0] + x_tensor_dist_attr.process_mesh = process_mesh + self.x_dist_tensor_spec = DistTensorSpec(x_shape, x_tensor_dist_attr) + + self.attrs = { + 'num_or_sections': 2, + 'axis': 1, + } + + def test_single_mesh_dim(self): + # num_or_sections = 2, axis = 1 + # [0, -1, -1] --> [0, -1, -1], [0, -1, -1], [0, -1, -1] + self.rule = get_spmd_rule("split_with_num") + self.attrs = {} + self.attrs['num'] = 2 + self.attrs['axis'] = 1 + self.x_dist_tensor_spec.set_dims_mapping([0, -1, -1]) + result_dist_attrs = self.rule.infer_forward( + [self.x_dist_tensor_spec], self.attrs + ) + infered_input_dist_attrs = result_dist_attrs[0] + infered_output_dist_attrs = result_dist_attrs[1] + + self.assertEqual(len(result_dist_attrs), 2) + self.assertEqual(len(infered_input_dist_attrs), 1) + self.assertEqual(len(infered_output_dist_attrs), 2) + + self.assertEqual(infered_input_dist_attrs[0].dims_mapping, [0, -1, -1]) + self.assertEqual(infered_output_dist_attrs[0].dims_mapping, [0, -1, -1]) + self.assertEqual(infered_output_dist_attrs[1].dims_mapping, [0, -1, -1]) + + # num_or_sections = [15, 16, 17], axis = 2 + # [0, -1, -1] --> [0, -1, -1], [0, -1, -1], [0, -1, -1], [0, -1, -1] + self.rule = get_spmd_rule("split") + self.attrs = {} + self.attrs['sections'] = [15, 16, 17] + self.attrs['axis'] = 2 + self.x_dist_tensor_spec.set_dims_mapping([0, -1, -1]) + result_dist_attrs = self.rule.infer_forward( + [self.x_dist_tensor_spec], self.attrs + ) + infered_input_dist_attrs = result_dist_attrs[0] + infered_output_dist_attrs = result_dist_attrs[1] + + self.assertEqual(len(result_dist_attrs), 2) + self.assertEqual(len(infered_input_dist_attrs), 1) + self.assertEqual(len(infered_output_dist_attrs), 3) + + self.assertEqual(infered_input_dist_attrs[0].dims_mapping, [0, -1, -1]) + self.assertEqual(infered_output_dist_attrs[0].dims_mapping, [0, -1, -1]) + self.assertEqual(infered_output_dist_attrs[1].dims_mapping, [0, -1, -1]) + self.assertEqual(infered_output_dist_attrs[2].dims_mapping, [0, -1, -1]) + + # num_or_sections = [15, 16, 17], axis = 2 + # [-1, -1, 0] --> [-1, -1, -1], [-1, -1, -1], [-1 -1, -1], [-1, -1, -1] + self.attrs = {} + self.attrs['sections'] = [15, 16, 17] + self.attrs['axis'] = 2 + self.x_dist_tensor_spec.set_dims_mapping([-1, -1, 0]) + result_dist_attrs = self.rule.infer_forward( + [self.x_dist_tensor_spec], self.attrs + ) + infered_input_dist_attrs = result_dist_attrs[0] + infered_output_dist_attrs = result_dist_attrs[1] + + self.assertEqual(len(result_dist_attrs), 2) + self.assertEqual(len(infered_input_dist_attrs), 1) + self.assertEqual(len(infered_output_dist_attrs), 3) + + self.assertEqual(infered_input_dist_attrs[0].dims_mapping, [-1, -1, -1]) + self.assertEqual( + infered_output_dist_attrs[0].dims_mapping, [-1, -1, -1] + ) + self.assertEqual( + infered_output_dist_attrs[1].dims_mapping, [-1, -1, -1] + ) + self.assertEqual( + infered_output_dist_attrs[2].dims_mapping, [-1, -1, -1] + ) + + # num_or_sections = 2, axis = -2 + # [0, -1, -1] --> [0, -1, -1], [0, -1, -1], [0, -1, -1] + self.rule = get_spmd_rule("split_with_num") + self.attrs = {} + self.attrs['num'] = 2 + self.attrs['axis'] = -2 + self.x_dist_tensor_spec.set_dims_mapping([0, -1, -1]) + result_dist_attrs = self.rule.infer_forward( + [self.x_dist_tensor_spec], self.attrs + ) + infered_input_dist_attrs = 
result_dist_attrs[0] + infered_output_dist_attrs = result_dist_attrs[1] + + self.assertEqual(len(result_dist_attrs), 2) + self.assertEqual(len(infered_input_dist_attrs), 1) + self.assertEqual(len(infered_output_dist_attrs), 2) + + self.assertEqual(infered_input_dist_attrs[0].dims_mapping, [0, -1, -1]) + self.assertEqual(infered_output_dist_attrs[0].dims_mapping, [0, -1, -1]) + self.assertEqual(infered_output_dist_attrs[1].dims_mapping, [0, -1, -1]) + + def test_multi_mesh_dim(self): + process_mesh = auto.ProcessMesh(mesh=[[0, 1, 2], [3, 4, 5]]) + self.x_dist_tensor_spec.set_process_mesh(process_mesh) + self.x_dist_tensor_spec.shape = [96, 32, 48, 24] + + # num_or_sections = 3, axis = -1 + # [0, 1, -1, -1] --> [0, 1, -1, -1], [0, 1, -1, -1], [0, 1, -1, -1], [0, 1, -1, -1] + self.rule = get_spmd_rule("split_with_num") + self.attrs = {} + self.attrs['num'] = 3 + self.attrs['axis'] = -1 + self.x_dist_tensor_spec.set_dims_mapping([0, 1, -1, -1]) + result_dist_attrs = self.rule.infer_forward( + [self.x_dist_tensor_spec], self.attrs + ) + infered_input_dist_attrs = result_dist_attrs[0] + infered_output_dist_attrs = result_dist_attrs[1] + + self.assertEqual(len(result_dist_attrs), 2) + self.assertEqual(len(infered_input_dist_attrs), 1) + self.assertEqual(len(infered_output_dist_attrs), 3) + + self.assertEqual( + infered_input_dist_attrs[0].dims_mapping, [0, 1, -1, -1] + ) + self.assertEqual( + infered_output_dist_attrs[0].dims_mapping, [0, 1, -1, -1] + ) + self.assertEqual( + infered_output_dist_attrs[1].dims_mapping, [0, 1, -1, -1] + ) + self.assertEqual( + infered_output_dist_attrs[2].dims_mapping, [0, 1, -1, -1] + ) + + # num_or_sections = [32, 32, 32], axis = 0 + # [0, 1, -1, -1] --> [-1, 1, -1, -1], [-1, 1, -1, -1], [-1, 1, -1, -1], [-1, 1, -1, -1] + self.rule = get_spmd_rule("split") + self.attrs = {} + self.attrs['sections'] = [32, 32, 32] + self.attrs['axis'] = 0 + self.x_dist_tensor_spec.set_dims_mapping([0, 1, -1, -1]) + result_dist_attrs = self.rule.infer_forward( + [self.x_dist_tensor_spec], self.attrs + ) + infered_input_dist_attrs = result_dist_attrs[0] + infered_output_dist_attrs = result_dist_attrs[1] + + self.assertEqual(len(result_dist_attrs), 2) + self.assertEqual(len(infered_input_dist_attrs), 1) + self.assertEqual(len(infered_output_dist_attrs), 3) + + self.assertEqual( + infered_input_dist_attrs[0].dims_mapping, [-1, 1, -1, -1] + ) + self.assertEqual( + infered_output_dist_attrs[0].dims_mapping, [-1, 1, -1, -1] + ) + self.assertEqual( + infered_output_dist_attrs[1].dims_mapping, [-1, 1, -1, -1] + ) + self.assertEqual( + infered_output_dist_attrs[2].dims_mapping, [-1, 1, -1, -1] + ) + + +if __name__ == "__main__": + unittest.main() From 4ff8fca52c7bb2507aba4c21cefae3f09de2a527 Mon Sep 17 00:00:00 2001 From: umiswing Date: Mon, 24 Jul 2023 11:07:22 +0800 Subject: [PATCH 07/34] Fix test_sparse_norm_op failure. (#55405) * Fix test failed on cudnn. * Fix codestyle. 
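
The underlying fix, visible in the diff below, is to seed both backward passes with one explicit random gradient instead of relying on the implicit all-ones gradient, so the dense and sparse batch-norm paths receive the same cotangent before their outputs and input gradients are compared. A condensed sketch of that pattern follows; the 5-D NDHWC shape, channel count, and tolerance are illustrative rather than taken from the test.

    import numpy as np
    import paddle

    data = np.random.rand(2, 6, 6, 6, 3).astype("float32")  # NDHWC layout
    x = paddle.to_tensor(data)
    x.stop_gradient = False
    y = paddle.nn.BatchNorm3D(3, data_format="NDHWC")(x)

    # One shared random cotangent, instead of the implicit all-ones gradient.
    grad = np.random.uniform(-0.01, 0.01, y.shape).astype("float32")
    y.backward(paddle.to_tensor(grad))

    sp_x = paddle.to_tensor(data).to_sparse_coo(4)
    sp_x.stop_gradient = False
    sp_y = paddle.sparse.nn.BatchNorm(3, data_format="NDHWC")(sp_x)
    sp_y.backward(paddle.to_tensor(grad).to_sparse_coo(4))

    np.testing.assert_allclose(sp_y.to_dense().numpy(), y.numpy(), rtol=1e-5)
    np.testing.assert_allclose(
        sp_x.grad.to_dense().numpy(), x.grad.numpy(), rtol=1e-5
    )
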
--- test/legacy_test/test_sparse_norm_op.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/test/legacy_test/test_sparse_norm_op.py b/test/legacy_test/test_sparse_norm_op.py index c17a252ee75a6..25a253d9787b4 100644 --- a/test/legacy_test/test_sparse_norm_op.py +++ b/test/legacy_test/test_sparse_norm_op.py @@ -100,19 +100,23 @@ def check(self, shape): else: bn = paddle.nn.BatchNorm3D(shape[-1], data_format=data_format) y = bn(x) - y.backward() + np.random.seed(5) + loss_data = np.random.uniform(-0.01, 0.01, y.shape).astype("float32") + loss = paddle.to_tensor(loss_data) + y.backward(loss) sp_x = paddle.to_tensor(data).to_sparse_coo(dim - 1) sp_x.stop_gradient = False sp_bn = paddle.sparse.nn.BatchNorm(shape[-1], data_format=data_format) sp_y = sp_bn(sp_x) - sp_y.backward() + sp_loss = loss.to_sparse_coo(dim - 1) + sp_y.backward(sp_loss) np.testing.assert_allclose( - y.numpy(), sp_y.to_dense().numpy(), rtol=1e-5 + sp_y.to_dense().numpy(), y.numpy(), rtol=1e-5 ) np.testing.assert_allclose( - x.grad.numpy(), sp_x.grad.to_dense().numpy(), rtol=1e-5 + sp_x.grad.to_dense().numpy(), x.grad.numpy(), rtol=1e-5 ) def test_nd(self): From 5b8f06377cd1199b6f9c96e7289bced90a9740c1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=82=85=E5=89=91=E5=AF=92?= Date: Mon, 24 Jul 2023 11:22:06 +0800 Subject: [PATCH 08/34] delete modification on pre-commit (#55519) --- paddle/cinn/auto_schedule/analysis/analyze_ir.cc | 2 +- paddle/cinn/auto_schedule/cost_model/feature_extractor.cc | 2 +- paddle/cinn/auto_schedule/database/jsonfile_database_test.cc | 2 +- .../cinn/auto_schedule/search_space/auto_gen_rule/auto_bind.cc | 2 +- .../auto_schedule/search_space/auto_gen_rule/auto_inline.cc | 2 +- .../auto_schedule/search_space/auto_gen_rule/auto_unroll.cc | 2 +- .../search_space/auto_gen_rule/multi_level_tiling.cc | 2 +- .../cinn/auto_schedule/search_space/auto_gen_rule/skip_rule.cc | 2 +- paddle/cinn/auto_schedule/search_space/search_space.cc | 2 +- .../cinn/auto_schedule/search_strategy/evolutionary_search.cc | 2 +- paddle/cinn/auto_schedule/task/task_optimizer.cc | 2 +- paddle/cinn/auto_schedule/task/task_registry.h | 2 +- paddle/cinn/backends/codegen_cuda_util.h | 2 +- paddle/cinn/common/cas.cc | 2 +- paddle/cinn/hlir/pe/broadcast.cc | 2 +- paddle/cinn/hlir/pe/nn.cc | 2 +- paddle/cinn/ir/schedule/ir_schedule.cc | 2 +- paddle/cinn/ir/schedule/ir_schedule_util.cc | 2 +- paddle/cinn/ir/test/CMakeLists.txt | 1 + paddle/cinn/{optim => ir/test}/ir_copy_test.cc | 2 +- paddle/cinn/ir/test/schedule_desc_test.cc | 2 +- paddle/cinn/ir/utils/CMakeLists.txt | 3 ++- paddle/cinn/{optim => ir/utils}/ir_copy.cc | 2 +- paddle/cinn/{optim => ir/utils}/ir_copy.h | 0 paddle/cinn/optim/CMakeLists.txt | 2 -- paddle/cinn/optim/compute_inline_expand.cc | 2 +- paddle/cinn/optim/ir_replace.cc | 2 +- paddle/cinn/optim/optimize.cc | 2 +- paddle/cinn/optim/replace_call_with_expr.cc | 2 +- paddle/cinn/optim/replace_var_with_expr.cc | 2 +- paddle/cinn/optim/transform_gpu_forloop.cc | 2 +- paddle/cinn/optim/transform_polyfor_to_for.cc | 2 +- paddle/cinn/optim/unroll_loops.cc | 2 +- paddle/cinn/optim/vectorize_loops.cc | 2 +- paddle/cinn/poly/stage.cc | 2 +- paddle/cinn/pybind/optim.cc | 2 +- 36 files changed, 35 insertions(+), 35 deletions(-) mode change 100755 => 100644 paddle/cinn/backends/codegen_cuda_util.h rename paddle/cinn/{optim => ir/test}/ir_copy_test.cc (95%) rename paddle/cinn/{optim => ir/utils}/ir_copy.cc (99%) rename paddle/cinn/{optim => ir/utils}/ir_copy.h (100%) mode change 100755 => 100644 
paddle/cinn/optim/ir_replace.cc diff --git a/paddle/cinn/auto_schedule/analysis/analyze_ir.cc b/paddle/cinn/auto_schedule/analysis/analyze_ir.cc index 8adc61ba269b4..17aad495b246a 100644 --- a/paddle/cinn/auto_schedule/analysis/analyze_ir.cc +++ b/paddle/cinn/auto_schedule/analysis/analyze_ir.cc @@ -26,10 +26,10 @@ #include "paddle/cinn/ir/lowered_func.h" #include "paddle/cinn/ir/schedule/ir_schedule.h" #include "paddle/cinn/ir/tensor.h" +#include "paddle/cinn/ir/utils/ir_copy.h" #include "paddle/cinn/ir/utils/ir_nodes_collector.h" #include "paddle/cinn/ir/utils/ir_printer.h" #include "paddle/cinn/lang/lower.h" -#include "paddle/cinn/optim/ir_copy.h" #include "paddle/cinn/optim/optimize.h" #include "paddle/cinn/optim/transform_gpu_forloop.h" diff --git a/paddle/cinn/auto_schedule/cost_model/feature_extractor.cc b/paddle/cinn/auto_schedule/cost_model/feature_extractor.cc index 3369ef4b6ba8b..db2d3f62ed6a9 100644 --- a/paddle/cinn/auto_schedule/cost_model/feature_extractor.cc +++ b/paddle/cinn/auto_schedule/cost_model/feature_extractor.cc @@ -34,8 +34,8 @@ #include "paddle/cinn/ir/ir.h" #include "paddle/cinn/ir/ir_base.h" #include "paddle/cinn/ir/schedule/ir_schedule.h" +#include "paddle/cinn/ir/utils/ir_copy.h" #include "paddle/cinn/ir/utils/ir_printer.h" -#include "paddle/cinn/optim/ir_copy.h" #include "paddle/cinn/optim/transform_polyfor_to_for.h" namespace cinn { diff --git a/paddle/cinn/auto_schedule/database/jsonfile_database_test.cc b/paddle/cinn/auto_schedule/database/jsonfile_database_test.cc index d977e02dc15c2..9a7bf9d568bc3 100644 --- a/paddle/cinn/auto_schedule/database/jsonfile_database_test.cc +++ b/paddle/cinn/auto_schedule/database/jsonfile_database_test.cc @@ -24,8 +24,8 @@ #include "paddle/cinn/auto_schedule/task/task_registry.h" #include "paddle/cinn/cinn.h" #include "paddle/cinn/ir/schedule/ir_schedule.h" +#include "paddle/cinn/ir/utils/ir_copy.h" #include "paddle/cinn/ir/utils/ir_printer.h" -#include "paddle/cinn/optim/ir_copy.h" namespace cinn { namespace auto_schedule { diff --git a/paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_bind.cc b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_bind.cc index 3fd38ab0d4ba6..06215d98d8b27 100644 --- a/paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_bind.cc +++ b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_bind.cc @@ -17,9 +17,9 @@ #include #include "paddle/cinn/ir/schedule/ir_schedule.h" +#include "paddle/cinn/ir/utils/ir_copy.h" #include "paddle/cinn/ir/utils/ir_nodes_collector.h" #include "paddle/cinn/ir/utils/ir_printer.h" -#include "paddle/cinn/optim/ir_copy.h" namespace cinn { namespace auto_schedule { diff --git a/paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_inline.cc b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_inline.cc index db5f8e711d8d9..946947611f35d 100644 --- a/paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_inline.cc +++ b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_inline.cc @@ -27,9 +27,9 @@ #include "paddle/cinn/ir/ir.h" #include "paddle/cinn/ir/ir_base.h" #include "paddle/cinn/ir/schedule/ir_schedule.h" +#include "paddle/cinn/ir/utils/ir_copy.h" #include "paddle/cinn/ir/utils/ir_nodes_collector.h" #include "paddle/cinn/ir/utils/ir_printer.h" -#include "paddle/cinn/optim/ir_copy.h" namespace cinn { namespace auto_schedule { diff --git a/paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_unroll.cc b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_unroll.cc index 3ea7882b80654..946bd9e9d7730 100644 
--- a/paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_unroll.cc +++ b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_unroll.cc @@ -19,9 +19,9 @@ #include #include "paddle/cinn/ir/schedule/ir_schedule.h" +#include "paddle/cinn/ir/utils/ir_copy.h" #include "paddle/cinn/ir/utils/ir_nodes_collector.h" #include "paddle/cinn/ir/utils/ir_printer.h" -#include "paddle/cinn/optim/ir_copy.h" namespace cinn { namespace auto_schedule { diff --git a/paddle/cinn/auto_schedule/search_space/auto_gen_rule/multi_level_tiling.cc b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/multi_level_tiling.cc index b521db598fca0..feecf79a07d10 100644 --- a/paddle/cinn/auto_schedule/search_space/auto_gen_rule/multi_level_tiling.cc +++ b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/multi_level_tiling.cc @@ -31,9 +31,9 @@ #include "paddle/cinn/ir/ir_base.h" #include "paddle/cinn/ir/schedule/ir_schedule.h" #include "paddle/cinn/ir/tensor.h" +#include "paddle/cinn/ir/utils/ir_copy.h" #include "paddle/cinn/ir/utils/ir_nodes_collector.h" #include "paddle/cinn/ir/utils/ir_printer.h" -#include "paddle/cinn/optim/ir_copy.h" namespace cinn { namespace auto_schedule { diff --git a/paddle/cinn/auto_schedule/search_space/auto_gen_rule/skip_rule.cc b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/skip_rule.cc index 9ad9dd143b494..7810822299c8c 100644 --- a/paddle/cinn/auto_schedule/search_space/auto_gen_rule/skip_rule.cc +++ b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/skip_rule.cc @@ -19,7 +19,7 @@ #include "paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_gen_rule.h" #include "paddle/cinn/common/target.h" #include "paddle/cinn/ir/schedule/ir_schedule.h" -#include "paddle/cinn/optim/ir_copy.h" +#include "paddle/cinn/ir/utils/ir_copy.h" namespace cinn { namespace auto_schedule { diff --git a/paddle/cinn/auto_schedule/search_space/search_space.cc b/paddle/cinn/auto_schedule/search_space/search_space.cc index f3f8ba2f673f9..9ceab068c76d4 100644 --- a/paddle/cinn/auto_schedule/search_space/search_space.cc +++ b/paddle/cinn/auto_schedule/search_space/search_space.cc @@ -31,7 +31,7 @@ #include "paddle/cinn/auto_schedule/task/tune_task.h" #include "paddle/cinn/ir/ir_base.h" #include "paddle/cinn/ir/schedule/ir_schedule.h" -#include "paddle/cinn/optim/ir_copy.h" +#include "paddle/cinn/ir/utils/ir_copy.h" #include "paddle/cinn/runtime/flags.h" DECLARE_bool(auto_schedule_use_cost_model); diff --git a/paddle/cinn/auto_schedule/search_strategy/evolutionary_search.cc b/paddle/cinn/auto_schedule/search_strategy/evolutionary_search.cc index d139cc4c1d309..c2668e1da72d3 100644 --- a/paddle/cinn/auto_schedule/search_strategy/evolutionary_search.cc +++ b/paddle/cinn/auto_schedule/search_strategy/evolutionary_search.cc @@ -31,7 +31,7 @@ #include "paddle/cinn/auto_schedule/task/task_registry.h" #include "paddle/cinn/auto_schedule/task/tune_task.h" #include "paddle/cinn/auto_schedule/tuning.h" -#include "paddle/cinn/optim/ir_copy.h" +#include "paddle/cinn/ir/utils/ir_copy.h" #include "paddle/cinn/utils/multi_threading.h" #include "paddle/cinn/utils/sized_multi_set.h" #include "paddle/cinn/utils/string.h" diff --git a/paddle/cinn/auto_schedule/task/task_optimizer.cc b/paddle/cinn/auto_schedule/task/task_optimizer.cc index cd89eedbb75d1..c4a03bde62464 100644 --- a/paddle/cinn/auto_schedule/task/task_optimizer.cc +++ b/paddle/cinn/auto_schedule/task/task_optimizer.cc @@ -30,7 +30,7 @@ #include "paddle/cinn/ir/ir.h" #include "paddle/cinn/ir/ir_base.h" #include "paddle/cinn/ir/schedule/ir_schedule.h" 
-#include "paddle/cinn/optim/ir_copy.h" +#include "paddle/cinn/ir/utils/ir_copy.h" #include "paddle/cinn/optim/transform_gpu_forloop.h" #include "paddle/cinn/runtime/flags.h" #include "paddle/cinn/utils/string.h" diff --git a/paddle/cinn/auto_schedule/task/task_registry.h b/paddle/cinn/auto_schedule/task/task_registry.h index 158a899e5b915..c506a7ea332ef 100644 --- a/paddle/cinn/auto_schedule/task/task_registry.h +++ b/paddle/cinn/auto_schedule/task/task_registry.h @@ -20,7 +20,7 @@ #include #include "paddle/cinn/ir/schedule/ir_schedule.h" -#include "paddle/cinn/optim/ir_copy.h" +#include "paddle/cinn/ir/utils/ir_copy.h" #include "paddle/cinn/utils/registry.h" namespace cinn { diff --git a/paddle/cinn/backends/codegen_cuda_util.h b/paddle/cinn/backends/codegen_cuda_util.h old mode 100755 new mode 100644 index a0f892ca58c02..802d93d9a3144 --- a/paddle/cinn/backends/codegen_cuda_util.h +++ b/paddle/cinn/backends/codegen_cuda_util.h @@ -22,8 +22,8 @@ #include "paddle/cinn/cinn.h" #include "paddle/cinn/ir/ir.h" +#include "paddle/cinn/ir/utils/ir_copy.h" #include "paddle/cinn/ir/utils/ir_mutator.h" -#include "paddle/cinn/optim/ir_copy.h" namespace cinn { namespace backends { diff --git a/paddle/cinn/common/cas.cc b/paddle/cinn/common/cas.cc index ff867858d80f8..b72650301bbfe 100644 --- a/paddle/cinn/common/cas.cc +++ b/paddle/cinn/common/cas.cc @@ -22,12 +22,12 @@ #include "paddle/cinn/common/arithmatic.h" #include "paddle/cinn/common/ir_util.h" #include "paddle/cinn/ir/op/ir_operators.h" +#include "paddle/cinn/ir/utils/ir_copy.h" #include "paddle/cinn/ir/utils/ir_mutator.h" #include "paddle/cinn/ir/utils/ir_nodes_collector.h" #include "paddle/cinn/ir/utils/ir_printer.h" #include "paddle/cinn/ir/utils/ir_visitor.h" #include "paddle/cinn/optim/cast_simplify.h" -#include "paddle/cinn/optim/ir_copy.h" #include "paddle/cinn/utils/string.h" namespace cinn { diff --git a/paddle/cinn/hlir/pe/broadcast.cc b/paddle/cinn/hlir/pe/broadcast.cc index 29abc69587f5e..30ed792869276 100644 --- a/paddle/cinn/hlir/pe/broadcast.cc +++ b/paddle/cinn/hlir/pe/broadcast.cc @@ -20,9 +20,9 @@ #include "paddle/cinn/hlir/op/op_util.h" #include "paddle/cinn/ir/ir_base.h" #include "paddle/cinn/ir/op/ir_operators.h" +#include "paddle/cinn/ir/utils/ir_copy.h" #include "paddle/cinn/lang/builtin.h" #include "paddle/cinn/lang/compute.h" -#include "paddle/cinn/optim/ir_copy.h" namespace cinn { namespace hlir { diff --git a/paddle/cinn/hlir/pe/nn.cc b/paddle/cinn/hlir/pe/nn.cc index 26929568c901f..030cd7472e430 100644 --- a/paddle/cinn/hlir/pe/nn.cc +++ b/paddle/cinn/hlir/pe/nn.cc @@ -29,9 +29,9 @@ #include "paddle/cinn/hlir/pe/nn_util.h" #include "paddle/cinn/hlir/pe/schedule.h" #include "paddle/cinn/ir/op/ir_operators.h" +#include "paddle/cinn/ir/utils/ir_copy.h" #include "paddle/cinn/lang/builtin.h" #include "paddle/cinn/lang/compute.h" -#include "paddle/cinn/optim/ir_copy.h" namespace cinn { namespace hlir { diff --git a/paddle/cinn/ir/schedule/ir_schedule.cc b/paddle/cinn/ir/schedule/ir_schedule.cc index af0db05f36af0..88609c7a7eb9b 100644 --- a/paddle/cinn/ir/schedule/ir_schedule.cc +++ b/paddle/cinn/ir/schedule/ir_schedule.cc @@ -32,11 +32,11 @@ #include "paddle/cinn/ir/op/ir_operators.h" #include "paddle/cinn/ir/schedule/ir_schedule_error.h" #include "paddle/cinn/ir/schedule/ir_schedule_util.h" +#include "paddle/cinn/ir/utils/ir_copy.h" #include "paddle/cinn/ir/utils/ir_mutator.h" #include "paddle/cinn/ir/utils/ir_printer.h" #include "paddle/cinn/ir/utils/ir_visitor.h" #include "paddle/cinn/lang/compute.h" -#include 
"paddle/cinn/optim/ir_copy.h" #include "paddle/cinn/optim/ir_simplify.h" #include "paddle/cinn/optim/replace_var_with_expr.h" #include "paddle/cinn/utils/string.h" diff --git a/paddle/cinn/ir/schedule/ir_schedule_util.cc b/paddle/cinn/ir/schedule/ir_schedule_util.cc index 70bc7b0415b88..adfe5fdcef861 100644 --- a/paddle/cinn/ir/schedule/ir_schedule_util.cc +++ b/paddle/cinn/ir/schedule/ir_schedule_util.cc @@ -27,11 +27,11 @@ #include "paddle/cinn/common/ir_util.h" #include "paddle/cinn/ir/ir.h" #include "paddle/cinn/ir/op/ir_operators.h" +#include "paddle/cinn/ir/utils/ir_copy.h" #include "paddle/cinn/ir/utils/ir_nodes_collector.h" #include "paddle/cinn/ir/utils/ir_printer.h" #include "paddle/cinn/ir/utils/ir_visitor.h" #include "paddle/cinn/lang/compute.h" -#include "paddle/cinn/optim/ir_copy.h" #include "paddle/cinn/optim/ir_simplify.h" #include "paddle/cinn/optim/replace_var_with_expr.h" diff --git a/paddle/cinn/ir/test/CMakeLists.txt b/paddle/cinn/ir/test/CMakeLists.txt index de5816d9e808d..bef31ed067e3b 100644 --- a/paddle/cinn/ir/test/CMakeLists.txt +++ b/paddle/cinn/ir/test/CMakeLists.txt @@ -16,3 +16,4 @@ cinn_cc_test(test_intrinsic_ops SRCS intrinsic_ops_test.cc DEPS cinncore) cinn_cc_test(test_ir_verify SRCS ir_verify_test.cc DEPS cinncore) cinn_cc_test(test_schedule_desc SRCS schedule_desc_test.cc DEPS cinncore) cinn_cc_test(test_ir_compare SRCS ir_compare_test.cc DEPS cinncore) +cinn_cc_test(test_ir_copy SRCS ir_copy_test.cc DEPS cinncore) diff --git a/paddle/cinn/optim/ir_copy_test.cc b/paddle/cinn/ir/test/ir_copy_test.cc similarity index 95% rename from paddle/cinn/optim/ir_copy_test.cc rename to paddle/cinn/ir/test/ir_copy_test.cc index 91128264853a1..cd3199d4947dd 100644 --- a/paddle/cinn/optim/ir_copy_test.cc +++ b/paddle/cinn/ir/test/ir_copy_test.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/cinn/optim/ir_copy.h" +#include "paddle/cinn/ir/utils/ir_copy.h" #include diff --git a/paddle/cinn/ir/test/schedule_desc_test.cc b/paddle/cinn/ir/test/schedule_desc_test.cc index 1cefd9a1b1971..66a01b14b08d7 100644 --- a/paddle/cinn/ir/test/schedule_desc_test.cc +++ b/paddle/cinn/ir/test/schedule_desc_test.cc @@ -20,9 +20,9 @@ #include "paddle/cinn/cinn.h" #include "paddle/cinn/common/context.h" #include "paddle/cinn/ir/schedule/ir_schedule.h" +#include "paddle/cinn/ir/utils/ir_copy.h" #include "paddle/cinn/ir/utils/ir_printer.h" #include "paddle/cinn/lang/lower.h" -#include "paddle/cinn/optim/ir_copy.h" #include "paddle/cinn/utils/string.h" #include "paddle/cinn/utils/type_defs.h" diff --git a/paddle/cinn/ir/utils/CMakeLists.txt b/paddle/cinn/ir/utils/CMakeLists.txt index 58d9c3b59dc1a..5613bf7260155 100644 --- a/paddle/cinn/ir/utils/CMakeLists.txt +++ b/paddle/cinn/ir/utils/CMakeLists.txt @@ -8,4 +8,5 @@ gather_srcs( ir_printer.cc ir_verify.cc ir_compare.cc - ir_nodes_collector.cc) + ir_nodes_collector.cc + ir_copy.cc) diff --git a/paddle/cinn/optim/ir_copy.cc b/paddle/cinn/ir/utils/ir_copy.cc similarity index 99% rename from paddle/cinn/optim/ir_copy.cc rename to paddle/cinn/ir/utils/ir_copy.cc index 3bf7eb323a8fe..22d7c99bcd322 100644 --- a/paddle/cinn/optim/ir_copy.cc +++ b/paddle/cinn/ir/utils/ir_copy.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/cinn/optim/ir_copy.h" +#include "paddle/cinn/ir/utils/ir_copy.h" #include #include diff --git a/paddle/cinn/optim/ir_copy.h b/paddle/cinn/ir/utils/ir_copy.h similarity index 100% rename from paddle/cinn/optim/ir_copy.h rename to paddle/cinn/ir/utils/ir_copy.h diff --git a/paddle/cinn/optim/CMakeLists.txt b/paddle/cinn/optim/CMakeLists.txt index c8313a91af439..b6d3632d4bc7e 100755 --- a/paddle/cinn/optim/CMakeLists.txt +++ b/paddle/cinn/optim/CMakeLists.txt @@ -5,7 +5,6 @@ gather_srcs( SRCS remove_nested_block.cc replace_call_with_expr.cc - ir_copy.cc ir_replace.cc replace_var_with_expr.cc tensor_write_tell.cc @@ -38,7 +37,6 @@ endif() cinn_cc_test(test_remove_nested_block SRCS remove_nested_block_test.cc DEPS cinncore) -cinn_cc_test(test_ir_copy SRCS ir_copy_test.cc DEPS cinncore) cinn_cc_test(test_ir_simplify SRCS ir_simplify_test.cc DEPS cinncore) cinn_cc_test(test_replace_call_with_expr SRCS replace_call_with_expr_test.cc DEPS cinncore) diff --git a/paddle/cinn/optim/compute_inline_expand.cc b/paddle/cinn/optim/compute_inline_expand.cc index d18d5a45f6375..8dad52ab4d9bc 100644 --- a/paddle/cinn/optim/compute_inline_expand.cc +++ b/paddle/cinn/optim/compute_inline_expand.cc @@ -18,8 +18,8 @@ #include #include "paddle/cinn/common/graph_utils.h" +#include "paddle/cinn/ir/utils/ir_copy.h" #include "paddle/cinn/ir/utils/ir_mutator.h" -#include "paddle/cinn/optim/ir_copy.h" #include "paddle/cinn/optim/replace_var_with_expr.h" namespace cinn { diff --git a/paddle/cinn/optim/ir_replace.cc b/paddle/cinn/optim/ir_replace.cc old mode 100755 new mode 100644 index 815fe4a15b2d9..3dc39a08a3817 --- a/paddle/cinn/optim/ir_replace.cc +++ b/paddle/cinn/optim/ir_replace.cc @@ -16,9 +16,9 @@ #include +#include "paddle/cinn/ir/utils/ir_copy.h" #include "paddle/cinn/ir/utils/ir_mutator.h" #include "paddle/cinn/ir/utils/ir_printer.h" -#include "paddle/cinn/optim/ir_copy.h" #include "paddle/cinn/utils/string.h" namespace cinn { diff --git a/paddle/cinn/optim/optimize.cc b/paddle/cinn/optim/optimize.cc index 2acf8c331d11a..f7ff0643cdeb3 100644 --- a/paddle/cinn/optim/optimize.cc +++ b/paddle/cinn/optim/optimize.cc @@ -15,6 +15,7 @@ #include "paddle/cinn/optim/optimize.h" #include "paddle/cinn/ir/schedule/ir_schedule_util.h" +#include "paddle/cinn/ir/utils/ir_copy.h" #include "paddle/cinn/ir/utils/ir_printer.h" #include "paddle/cinn/optim/call_arg_list_to_pod_value.h" #include "paddle/cinn/optim/cast_bool_to_int8.h" @@ -24,7 +25,6 @@ #include "paddle/cinn/optim/fold_cinn_call_arguments.h" #include "paddle/cinn/optim/if_simplify.h" #include "paddle/cinn/optim/insert_debug_log_callee.h" -#include "paddle/cinn/optim/ir_copy.h" #include "paddle/cinn/optim/ir_simplify.h" #include "paddle/cinn/optim/lower_function_call_bind_vars.h" #include "paddle/cinn/optim/lower_intrin.h" diff --git a/paddle/cinn/optim/replace_call_with_expr.cc b/paddle/cinn/optim/replace_call_with_expr.cc index 26dc7662ec5ac..d63210d1d28f1 100644 --- a/paddle/cinn/optim/replace_call_with_expr.cc +++ b/paddle/cinn/optim/replace_call_with_expr.cc @@ -14,9 +14,9 @@ #include "paddle/cinn/optim/replace_call_with_expr.h" +#include "paddle/cinn/ir/utils/ir_copy.h" #include "paddle/cinn/ir/utils/ir_mutator.h" #include "paddle/cinn/ir/utils/ir_printer.h" -#include "paddle/cinn/optim/ir_copy.h" #include "paddle/cinn/optim/replace_var_with_expr.h" namespace cinn { diff --git a/paddle/cinn/optim/replace_var_with_expr.cc b/paddle/cinn/optim/replace_var_with_expr.cc index a8c9f8cd961d0..57ab47d7c0250 100644 --- 
a/paddle/cinn/optim/replace_var_with_expr.cc +++ b/paddle/cinn/optim/replace_var_with_expr.cc @@ -18,9 +18,9 @@ #include "paddle/cinn/ir/ir.h" #include "paddle/cinn/ir/op/ir_operators.h" #include "paddle/cinn/ir/tensor.h" +#include "paddle/cinn/ir/utils/ir_copy.h" #include "paddle/cinn/ir/utils/ir_mutator.h" #include "paddle/cinn/ir/utils/ir_printer.h" -#include "paddle/cinn/optim/ir_copy.h" #include "paddle/cinn/optim/ir_simplify.h" #include "paddle/cinn/optim/replace_const_param_to_integer.h" diff --git a/paddle/cinn/optim/transform_gpu_forloop.cc b/paddle/cinn/optim/transform_gpu_forloop.cc index 0b95313034ba2..d12a5c9f2dab8 100644 --- a/paddle/cinn/optim/transform_gpu_forloop.cc +++ b/paddle/cinn/optim/transform_gpu_forloop.cc @@ -24,9 +24,9 @@ #include "paddle/cinn/common/cas.h" #include "paddle/cinn/common/ir_util.h" #include "paddle/cinn/ir/ir.h" +#include "paddle/cinn/ir/utils/ir_copy.h" #include "paddle/cinn/ir/utils/ir_mutator.h" #include "paddle/cinn/ir/utils/ir_printer.h" -#include "paddle/cinn/optim/ir_copy.h" #include "paddle/cinn/optim/ir_simplify.h" #include "paddle/cinn/optim/replace_var_with_expr.h" #include "paddle/cinn/poly/isl_utils.h" diff --git a/paddle/cinn/optim/transform_polyfor_to_for.cc b/paddle/cinn/optim/transform_polyfor_to_for.cc index 82d500d5dca06..5efc0a1f44be3 100644 --- a/paddle/cinn/optim/transform_polyfor_to_for.cc +++ b/paddle/cinn/optim/transform_polyfor_to_for.cc @@ -22,10 +22,10 @@ #include "paddle/cinn/common/ir_util.h" #include "paddle/cinn/common/type.h" #include "paddle/cinn/ir/op/ir_operators.h" +#include "paddle/cinn/ir/utils/ir_copy.h" #include "paddle/cinn/ir/utils/ir_mutator.h" #include "paddle/cinn/ir/utils/ir_printer.h" #include "paddle/cinn/ir/utils/ir_visitor.h" -#include "paddle/cinn/optim/ir_copy.h" #include "paddle/cinn/optim/ir_simplify.h" namespace cinn { diff --git a/paddle/cinn/optim/unroll_loops.cc b/paddle/cinn/optim/unroll_loops.cc index 0510774805992..fc5fab85eca5f 100644 --- a/paddle/cinn/optim/unroll_loops.cc +++ b/paddle/cinn/optim/unroll_loops.cc @@ -18,9 +18,9 @@ #include #include "paddle/cinn/ir/op/ir_operators.h" +#include "paddle/cinn/ir/utils/ir_copy.h" #include "paddle/cinn/ir/utils/ir_mutator.h" #include "paddle/cinn/ir/utils/ir_printer.h" -#include "paddle/cinn/optim/ir_copy.h" #include "paddle/cinn/optim/ir_replace.h" namespace cinn { diff --git a/paddle/cinn/optim/vectorize_loops.cc b/paddle/cinn/optim/vectorize_loops.cc index e15ab71d46729..745bec47b4507 100644 --- a/paddle/cinn/optim/vectorize_loops.cc +++ b/paddle/cinn/optim/vectorize_loops.cc @@ -26,9 +26,9 @@ #include "paddle/cinn/common/cas.h" #include "paddle/cinn/common/ir_util.h" #include "paddle/cinn/ir/op/ir_operators.h" +#include "paddle/cinn/ir/utils/ir_copy.h" #include "paddle/cinn/ir/utils/ir_nodes_collector.h" #include "paddle/cinn/ir/utils/ir_printer.h" -#include "paddle/cinn/optim/ir_copy.h" #include "paddle/cinn/optim/ir_replace.h" #include "paddle/cinn/optim/ir_simplify.h" #include "paddle/cinn/optim/tensor_write_tell.h" diff --git a/paddle/cinn/poly/stage.cc b/paddle/cinn/poly/stage.cc index e5faafc5348fc..faa7a99c0cfde 100644 --- a/paddle/cinn/poly/stage.cc +++ b/paddle/cinn/poly/stage.cc @@ -24,12 +24,12 @@ #include "paddle/cinn/common/axis.h" #include "paddle/cinn/ir/operation.h" #include "paddle/cinn/ir/tensor.h" +#include "paddle/cinn/ir/utils/ir_copy.h" #include "paddle/cinn/ir/utils/ir_mutator.h" #include "paddle/cinn/ir/utils/ir_nodes_collector.h" #include "paddle/cinn/ir/utils/ir_printer.h" #include 
"paddle/cinn/ir/utils/ir_visitor.h" #include "paddle/cinn/lang/compute.h" -#include "paddle/cinn/optim/ir_copy.h" #include "paddle/cinn/optim/ir_replace.h" #include "paddle/cinn/optim/ir_simplify.h" #include "paddle/cinn/optim/replace_var_with_expr.h" diff --git a/paddle/cinn/pybind/optim.cc b/paddle/cinn/pybind/optim.cc index dfb159a23f352..00219477e8f85 100755 --- a/paddle/cinn/pybind/optim.cc +++ b/paddle/cinn/pybind/optim.cc @@ -18,7 +18,7 @@ #include "paddle/cinn/common/target.h" #include "paddle/cinn/common/type.h" #include "paddle/cinn/ir/op/ir_operators.h" -#include "paddle/cinn/optim/ir_copy.h" +#include "paddle/cinn/ir/utils/ir_copy.h" #include "paddle/cinn/optim/ir_simplify.h" #include "paddle/cinn/pybind/bind.h" #include "paddle/cinn/pybind/bind_utils.h" From bea1f04c300ab932fd96f27ee1304f507e7bb6ba Mon Sep 17 00:00:00 2001 From: Xinyu Chen Date: Mon, 24 Jul 2023 06:33:45 +0000 Subject: [PATCH 09/34] onednn: remove fc_elementwise_add fusion (#55504) * onednn: remove fc+eltwiseadd fusion pass * onednn: remove post-sum fusion in fc kernel * onednn: tests: make unfused add run into f32 --- paddle/fluid/framework/ir/CMakeLists.txt | 1 - .../fc_elementwise_add_mkldnn_fuse_pass.cc | 165 ------------------ .../fc_elementwise_add_mkldnn_fuse_pass.h | 48 ----- .../inference/api/paddle_pass_builder.cc | 7 +- paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc | 35 +--- .../api/analysis_predictor_tester.cc | 4 +- .../api/analyzer_ernie_int8_tester.cc | 2 +- .../inference/api/analyzer_vit_ocr_tester.cc | 1 - test/ir/inference/CMakeLists.txt | 2 - ...est_onednn_fc_elementwise_add_fuse_pass.py | 104 ----------- 10 files changed, 5 insertions(+), 364 deletions(-) delete mode 100644 paddle/fluid/framework/ir/mkldnn/fc_elementwise_add_mkldnn_fuse_pass.cc delete mode 100644 paddle/fluid/framework/ir/mkldnn/fc_elementwise_add_mkldnn_fuse_pass.h delete mode 100644 test/ir/inference/test_onednn_fc_elementwise_add_fuse_pass.py diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index f4b88178499be..fa0fa7fccfd61 100755 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -174,7 +174,6 @@ if(WITH_MKLDNN) pass_library(conv_elementwise_add_mkldnn_fuse_pass inference DIR mkldnn) pass_library(int8_scale_calculation_mkldnn_pass inference DIR mkldnn) pass_library(params_quantization_mkldnn_pass inference DIR mkldnn) - pass_library(fc_elementwise_add_mkldnn_fuse_pass inference DIR mkldnn) pass_library(scale_matmul_fuse_pass inference DIR mkldnn) pass_library(cpu_bfloat16_placement_pass inference DIR mkldnn) pass_library(cpu_bfloat16_pass inference DIR mkldnn) diff --git a/paddle/fluid/framework/ir/mkldnn/fc_elementwise_add_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/fc_elementwise_add_mkldnn_fuse_pass.cc deleted file mode 100644 index ef01acd88c0b7..0000000000000 --- a/paddle/fluid/framework/ir/mkldnn/fc_elementwise_add_mkldnn_fuse_pass.cc +++ /dev/null @@ -1,165 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/framework/ir/mkldnn/fc_elementwise_add_mkldnn_fuse_pass.h" - -#include "paddle/fluid/framework/ir/graph_traits.h" -#include "paddle/fluid/framework/op_version_registry.h" -#include "paddle/utils/string/pretty_log.h" - -namespace paddle { -namespace framework { -namespace ir { - -FCResidualConnectionMKLDNNFusePass::FCResidualConnectionMKLDNNFusePass() { - AddOpCompat(OpCompat("fc")) - .AddInput("Input") - .IsTensor() - .End() - .AddInput("W") - .IsTensor() - .End() - .AddInput("Bias") - .IsTensor() - .End() - .AddOutput("Out") - .IsTensor() - .End() - .AddAttr("in_num_col_dims") - .IsNumGE(1) - .End(); - - AddOpCompat(OpCompat("elementwise_add")) - .AddInput("X") - .IsTensor() - .End() - .AddInput("Y") - .IsTensor() - .End() - .AddOutput("Out") - .IsTensor() - .End() - .AddAttr("axis") - .IsIntIn({-1, 0, 1}) - .End(); -} - -GraphWithStats FCResidualConnectionMKLDNNFusePass::FuseFC( - const std::string& name_scope, - const GraphWithStats& graph_with_stats, - bool fc_as_x) const { - GraphPatternDetector gpd; - auto pattern = gpd.mutable_pattern(); - patterns::FCMKLDNN fc_pattern{pattern, name_scope}; - auto fc_output = fc_pattern(false /* with residual */); - - patterns::ResidualElementwise elementwise_pattern{ - pattern, name_scope, fc_as_x}; - elementwise_pattern( - fc_output, - pattern->NewNode(elementwise_pattern.residual_data_repr()), - "elementwise_add", - fc_as_x); - fc_output->AsIntermediate(); - - int found_fc_count = 0; - - auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, - Graph* g) { - VLOG(4) << "Fuse fc + elementwise_add as residual"; - GET_IR_NODE_FROM_SUBGRAPH(fc_op, fc, fc_pattern); - GET_IR_NODE_FROM_SUBGRAPH(fc_input, input, fc_pattern); - GET_IR_NODE_FROM_SUBGRAPH(fc_weights, weights, fc_pattern); - GET_IR_NODE_FROM_SUBGRAPH(fc_output, output, fc_pattern); - - GET_IR_NODE_FROM_SUBGRAPH( - elementwise_op, elementwise_op, elementwise_pattern); - GET_IR_NODE_FROM_SUBGRAPH( - residual_data, residual_data, elementwise_pattern); - GET_IR_NODE_FROM_SUBGRAPH( - elementwise_out, elementwise_out, elementwise_pattern); - - if (FindFuseOption(*fc_op, *elementwise_op) != FUSE_MKLDNN) { - VLOG(4) << "Skipping fusion for " << fc_op->Name() << "(" << fc_op->id() - << ") with " << elementwise_op->Name() << "(" - << elementwise_op->id() - << ") because not both ops have use_mkldnn"; - return; - } - if (!IsReachable(g, residual_data, fc_output)) { - VLOG(4) << "Skipping fusion for " << fc_op->Name() << "(" << fc_op->id() - << ") with " << elementwise_op->Name() << "(" - << elementwise_op->id() << ") because residual input " - << residual_data->Name() << "(" << residual_data->id() - << ") is not " - "reachable"; - return; - } - if (HasFusedActivation(fc_op)) { - VLOG(4) << "Skipping fusion for " << fc_op->Name() << "(" << fc_op->id() - << ") with " << elementwise_op->Name() << "(" - << elementwise_op->id() << ") because fc has activation fused"; - return; - } - - if (!IsCompat(subgraph, g)) { - LOG(WARNING) - << "op compat for fc_elementwise_add_mkldnn_fuse_pass failed."; - return; - } - - fc_op->Op()->SetInput("ResidualData", {residual_data->Name()}); - fc_op->Op()->SetOutput("Out", {elementwise_out->Name()}); - fc_op->Op()->SetAttr("fuse_residual_connection", true); - - GraphSafeRemoveNodes(g, {fc_output, elementwise_op}); - - IR_NODE_LINK_TO(residual_data, fc_op); - IR_NODE_LINK_TO(fc_op, elementwise_out); - - found_fc_count++; - }; - - 
gpd(graph_with_stats.first, handler); - if ((!Has("disable_logs") || !Get("disable_logs")) && - (found_fc_count > 0)) { - std::stringstream msg_ss; - std::string fusionMode = fc_as_x ? "x" : "y"; - msg_ss << "--- Fused " << found_fc_count << " fc (as " << fusionMode - << ") + elementwise_add patterns"; - paddle::string::PrettyLogDetail(msg_ss.str().c_str()); - } - - return std::make_pair(graph_with_stats.first, - found_fc_count + graph_with_stats.second); -} - -void FCResidualConnectionMKLDNNFusePass::ApplyImpl(ir::Graph* graph) const { - FusePassBase::Init(name_scope_, graph); - auto graph_with_stats = FuseFC(name_scope_, std::make_pair(graph, 0), true); - graph_with_stats = FuseFC(name_scope_, graph_with_stats, false); - - AddStatis(graph_with_stats.second); -} -} // namespace ir -} // namespace framework -} // namespace paddle - -REGISTER_PASS(fc_elementwise_add_mkldnn_fuse_pass, - paddle::framework::ir::FCResidualConnectionMKLDNNFusePass); -REGISTER_PASS_CAPABILITY(fc_elementwise_add_mkldnn_fuse_pass) - .AddCombination( - paddle::framework::compatible::OpVersionComparatorCombination() - .LE("fc", 0) - .LE("elementwise_add", 1)); diff --git a/paddle/fluid/framework/ir/mkldnn/fc_elementwise_add_mkldnn_fuse_pass.h b/paddle/fluid/framework/ir/mkldnn/fc_elementwise_add_mkldnn_fuse_pass.h deleted file mode 100644 index f92ce5bfc7044..0000000000000 --- a/paddle/fluid/framework/ir/mkldnn/fc_elementwise_add_mkldnn_fuse_pass.h +++ /dev/null @@ -1,48 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#pragma once - -#include "paddle/fluid/framework/ir/fuse_pass_base.h" -#include "paddle/fluid/framework/ir/graph_pattern_detector.h" - -namespace paddle { -namespace framework { -namespace ir { - -using GraphWithStats = std::pair; - -class FCResidualConnectionMKLDNNFusePass : public FusePassBase { - private: - GraphWithStats FuseFC(const std::string& name_scope, - const GraphWithStats& graph_with_stats, - bool fc_as_x) const; - - public: - FCResidualConnectionMKLDNNFusePass(); - virtual ~FCResidualConnectionMKLDNNFusePass() {} - - protected: - void ApplyImpl(ir::Graph* graph) const; - - static bool HasFusedActivation(Node* fc_node) { - return !( - fc_node->Op()->GetAttrIfExists("activation_type").empty()); - } - - const std::string name_scope_{"fc_elementwise_add_mkldnn_fuse"}; -}; -} // namespace ir -} // namespace framework -} // namespace paddle diff --git a/paddle/fluid/inference/api/paddle_pass_builder.cc b/paddle/fluid/inference/api/paddle_pass_builder.cc index b2a8fc86c2305..18cdf2c624545 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.cc +++ b/paddle/fluid/inference/api/paddle_pass_builder.cc @@ -372,7 +372,6 @@ void CpuPassStrategy::EnableMKLDNN() { // Disabled due to topology-dependent speed-up "fc_mkldnn_pass", "fc_act_mkldnn_fuse_pass", - "fc_elementwise_add_mkldnn_fuse_pass", // "self_attention_fuse_pass", // "batch_norm_act_fuse_pass", // "softplus_activation_onednn_fuse_pass", // @@ -407,7 +406,6 @@ void CpuPassStrategy::EnableMkldnnBfloat16() { if (!use_mkldnn_bfloat16_) { passes_.push_back("fc_mkldnn_pass"); passes_.push_back("fc_act_mkldnn_fuse_pass"); - passes_.push_back("fc_elementwise_add_mkldnn_fuse_pass"); passes_.push_back("cpu_bfloat16_placement_pass"); passes_.push_back("cpu_bfloat16_pass"); @@ -463,7 +461,6 @@ void CpuPassStrategy::EnableMkldnnInt8() { passes_.push_back("repeated_fc_relu_fuse_pass"); passes_.push_back("fc_mkldnn_pass"); passes_.push_back("fc_act_mkldnn_fuse_pass"); - passes_.push_back("fc_elementwise_add_mkldnn_fuse_pass"); passes_.push_back("matmul_transpose_reshape_mkldnn_fuse_pass"); passes_.push_back("batch_norm_act_fuse_pass"); passes_.push_back("softplus_activation_onednn_fuse_pass"); @@ -498,9 +495,7 @@ void CpuPassStrategy::DisableMkldnnFcPasses() { void CpuPassStrategy::EraseFcMkldnnPasses() { std::vector fc_passes_to_erase( - {"fc_mkldnn_pass", - "fc_act_mkldnn_fuse_pass", - "fc_elementwise_add_mkldnn_fuse_pass"}); + {"fc_mkldnn_pass", "fc_act_mkldnn_fuse_pass"}); for (const auto &pass : fc_passes_to_erase) { int idx = GetPassIndex(pass); if (idx != -1) { diff --git a/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc index 5cf06bcbbdcf0..18680fe678b5d 100644 --- a/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc @@ -46,9 +46,6 @@ GetDNNLScales(const ExecutionContext& ctx) { auto scale_in_data = ctx.Attr("Scale_in"); auto scale_out = ctx.Attr("Scale_out"); auto scale_weights_data = ctx.Attr>("Scale_weights"); - auto scale_in_eltwise_data = ctx.HasAttr("Scale_in_eltwise") - ? 
ctx.Attr("Scale_in_eltwise") - : 1.0f; std::vector dnnl_src_scales = {1.f / scale_in_data}; size_t count = scale_weights_data.size(); @@ -57,7 +54,7 @@ GetDNNLScales(const ExecutionContext& ctx) { for (size_t i = 0; i < count; i++) { dnnl_wei_scales[i] = 1.f / scale_weights_data[i]; } - std::vector dnnl_psum_scales = {1.f / scale_in_eltwise_data}; + std::vector dnnl_psum_scales = {1.f}; std::vector dnnl_dst_scales = {1.f / scale_out}; return std::make_tuple( @@ -127,7 +124,6 @@ class FCMKLDNNHandler dnnl::primitive_attr attributes; dnnl::post_ops post_operations; - float sum_scale = 1.0f; float activation_scale = 1.0f; if (phi::funcs::is_int8()) { std::vector src_scales, wei_scales, psum_scales, dst_scales; @@ -168,13 +164,6 @@ class FCMKLDNNHandler dst_scales.data(), dst_scales.size() * sizeof(float)); } - - sum_scale = psum_scales[0]; - } - - if (ctx.HasAttr("fuse_residual_connection") && - ctx.Attr("fuse_residual_connection")) { - post_operations.append_sum(sum_scale); } // ReLU from "fc_fuse_pass" @@ -332,22 +321,6 @@ class FCMKLDNNHandler std::shared_ptr AcquireCustomDstMemory( const ExecutionContext& ctx, phi::DenseTensor* out) { - if (ctx.HasAttr("fuse_residual_connection") && - ctx.Attr("fuse_residual_connection")) { - auto* residual_param = ctx.Input("ResidualData"); - - PADDLE_ENFORCE_EQ( - out->dims(), - residual_param->dims(), - phi::errors::InvalidArgument( - "Output and elementwise parameter need to have the " - "same dimension sizes, but got output's dimension = %d" - " and residual param's dimension =%d .", - out->dims().size(), - residual_param->dims().size())); - - out->ShareDataWith(*residual_param); - } return this->template AcquireDstMemory(out); } // namespace operators @@ -458,11 +431,7 @@ class FCMKLDNNKernel : public framework::OpKernel { dst_memory_p = std::make_shared(inner_product_cache->dst_mem); - if (ctx.HasAttr("fuse_residual_connection") && - ctx.Attr("fuse_residual_connection")) { - auto* residual_param = ctx.Input("ResidualData"); - out->ShareDataWith(*residual_param); - } + auto out_ptr = out->mutable_data( ctx.GetPlace(), dst_memory_p->get_desc().get_size()); dst_memory_p->set_data_handle(out_ptr); diff --git a/test/cpp/inference/api/analysis_predictor_tester.cc b/test/cpp/inference/api/analysis_predictor_tester.cc index 6e3497d14a0dd..0827c3c196c19 100644 --- a/test/cpp/inference/api/analysis_predictor_tester.cc +++ b/test/cpp/inference/api/analysis_predictor_tester.cc @@ -370,9 +370,7 @@ TEST(AnalysisPredictor, mkldnn_fc_passes_cpu_pass_strategy) { CpuPassStrategy cpuPassStrategy; cpuPassStrategy.EnableMKLDNN(); const std::vector fc_passes_to_erase( - {"fc_mkldnn_pass", - "fc_act_mkldnn_fuse_pass", - "fc_elementwise_add_mkldnn_fuse_pass"}); + {"fc_mkldnn_pass", "fc_act_mkldnn_fuse_pass"}); for (const auto& pass : fc_passes_to_erase) { ASSERT_NE(cpuPassStrategy.GetPassIndex(pass), (size_t)-1); } diff --git a/test/cpp/inference/api/analyzer_ernie_int8_tester.cc b/test/cpp/inference/api/analyzer_ernie_int8_tester.cc index a93bbb1ab7e24..f8e7eb9ee8ad5 100644 --- a/test/cpp/inference/api/analyzer_ernie_int8_tester.cc +++ b/test/cpp/inference/api/analyzer_ernie_int8_tester.cc @@ -34,7 +34,7 @@ void SetInt8Config(AnalysisConfig *cfg, pass_builder->DeletePass("constant_folding_pass"); auto warmup_data = std::make_shared>(data); cfg->mkldnn_quantizer_config()->SetEnabledOpTypes( - {"elementwise_add", "matmul", "matmul_v2", "fused_matmul"}); + {"matmul", "matmul_v2", "fused_matmul"}); // Exclusion of several matmules that should not be quantized due to the fact 
// that they reduce the accuracy of the model cfg->mkldnn_quantizer_config()->SetExcludedOpIds( diff --git a/test/cpp/inference/api/analyzer_vit_ocr_tester.cc b/test/cpp/inference/api/analyzer_vit_ocr_tester.cc index 3582fc22c9cc2..43457a05b4b48 100644 --- a/test/cpp/inference/api/analyzer_vit_ocr_tester.cc +++ b/test/cpp/inference/api/analyzer_vit_ocr_tester.cc @@ -104,7 +104,6 @@ TEST(Analyzer_vit_ocr, fuse_status) { CHECK_EQ(fuse_statis.at("fc_mkldnn_pass"), 33); CHECK_EQ(fuse_statis.at("fused_conv2d_gelu_mkldnn_fuse_pass"), 2); - CHECK_EQ(fuse_statis.at("fc_elementwise_add_mkldnn_fuse"), 16); } #endif diff --git a/test/ir/inference/CMakeLists.txt b/test/ir/inference/CMakeLists.txt index 1932ad37490ff..1e7fcfa6a1005 100755 --- a/test/ir/inference/CMakeLists.txt +++ b/test/ir/inference/CMakeLists.txt @@ -301,8 +301,6 @@ if(WITH_GPU AND TENSORRT_FOUND) set_tests_properties(test_mkldnn_conv_mish_fuse_pass PROPERTIES TIMEOUT 300) set_tests_properties(test_onednn_fc_activation_fuse_pass PROPERTIES TIMEOUT 300) - set_tests_properties(test_onednn_fc_elementwise_add_fuse_pass - PROPERTIES TIMEOUT 120) set_tests_properties(test_mkldnn_conv_affine_channel_fuse_pass PROPERTIES TIMEOUT 60) endif() diff --git a/test/ir/inference/test_onednn_fc_elementwise_add_fuse_pass.py b/test/ir/inference/test_onednn_fc_elementwise_add_fuse_pass.py deleted file mode 100644 index 9c27452aff5c4..0000000000000 --- a/test/ir/inference/test_onednn_fc_elementwise_add_fuse_pass.py +++ /dev/null @@ -1,104 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import unittest -from functools import partial - -import hypothesis.strategies as st -import numpy as np -from auto_scan_test import PassAutoScanTest -from program_config import OpConfig, ProgramConfig, TensorConfig - - -class TestFCElementwiseAddOneDNNFusePass(PassAutoScanTest): - def sample_program_config(self, draw): - axis = draw(st.sampled_from([-1, 0, 1])) - fc_as_x = draw(st.booleans()) - fc_in = draw(st.sampled_from([32, 64])) - fc_wei = draw(st.sampled_from([32, 64])) - - def generate_data(shape): - return np.random.random(shape).astype(np.float32) - - relu_op = OpConfig( - type='relu', - inputs={'X': ['input_data']}, - outputs={'Out': ['relu_out']}, - attrs={}, - ) - - fc_op = OpConfig( - type='fc', - inputs={ - 'Input': ['relu_out'], - 'W': ['fc_weight'], - 'Bias': ['fc_bias'], - }, - outputs={'Out': ['fc_output']}, - attrs={ - 'use_mkldnn': True, - 'padding_weights': False, - 'activation_type': '', - 'in_num_col_dims': 1, - }, - ) - - if fc_as_x: - inputs = {'X': ['fc_output'], 'Y': ['input_data']} - else: - inputs = {'X': ['input_data'], 'Y': ['fc_output']} - - elt_add_op = OpConfig( - type='elementwise_add', - inputs=inputs, - outputs={'Out': ['elementwise_output']}, - attrs={'axis': axis, 'use_mkldnn': True}, - ) - - model_net = [relu_op, fc_op, elt_add_op] - - program_config = ProgramConfig( - ops=model_net, - weights={ - 'fc_weight': TensorConfig( - data_gen=partial(generate_data, [fc_wei, fc_wei]) - ), - 'fc_bias': TensorConfig( - data_gen=partial(generate_data, [fc_wei]) - ), - }, - inputs={ - 'input_data': TensorConfig( - data_gen=partial(generate_data, [fc_in, fc_wei]) - ) - }, - outputs=['elementwise_output'], - ) - - return program_config - - def sample_predictor_configs(self, program_config): - config = self.create_inference_config( - use_mkldnn=True, passes=['fc_elementwise_add_mkldnn_fuse_pass'] - ) - yield config, ['relu', 'fc'], (1e-5, 1e-5) - - def test(self): - self.run_and_statis( - quant=False, passes=['fc_elementwise_add_mkldnn_fuse_pass'] - ) - - -if __name__ == '__main__': - unittest.main() From 81bd57c75dd8824146536557c97dadd891c1e927 Mon Sep 17 00:00:00 2001 From: Fisher Date: Mon, 24 Jul 2023 14:36:18 +0800 Subject: [PATCH 10/34] [CINN] Remove threshold in op mapper relu6 (#55611) * Just set threshold to 6 in op mapper relu6 * Remove attrs in op mapper relu6 --- paddle/cinn/frontend/paddle_model_to_program.cc | 6 ------ 1 file changed, 6 deletions(-) diff --git a/paddle/cinn/frontend/paddle_model_to_program.cc b/paddle/cinn/frontend/paddle_model_to_program.cc index 316712ff40e61..8a9ca73af2654 100644 --- a/paddle/cinn/frontend/paddle_model_to_program.cc +++ b/paddle/cinn/frontend/paddle_model_to_program.cc @@ -398,12 +398,6 @@ void PaddleModelToProgram::AddOpMapper_relu6() { CHECK_EQ(op_desc.Output("Out").size(), 1UL); auto out_name = op_desc.Output("Out").front(); - absl::flat_hash_map attrs; - CHECK(op_desc.HasAttr("threshold")); - CHECK_EQ(op_desc.GetAttr("threshold"), 6.0f) - << "Threshold of Relu6 is not 6! To be implemented."; - attrs["threshold"] = op_desc.GetAttr("threshold"); - auto x = GetVar(TransValidVarName(x_name)); auto out = net_builder_->Relu6(x); AddVar(TransValidVarName(out_name), out); From b10b899cd686ad994f85f8ecb14bc40e9c95dce4 Mon Sep 17 00:00:00 2001 From: houj04 <35131887+houj04@users.noreply.github.com> Date: Mon, 24 Jul 2023 14:44:42 +0800 Subject: [PATCH 11/34] [PHI] add fused_softmax_mask and fused_softmax_mask_grad for CPU. 
(#55616) --- .../cpu/fused_softmax_mask_grad_kernel.cc | 39 +++++++++ .../fusion/cpu/fused_softmax_mask_kernel.cc | 84 +++++++++++++++++++ test/legacy_test/test_softmax_mask_fuse_op.py | 10 +-- 3 files changed, 125 insertions(+), 8 deletions(-) create mode 100644 paddle/phi/kernels/fusion/cpu/fused_softmax_mask_grad_kernel.cc create mode 100644 paddle/phi/kernels/fusion/cpu/fused_softmax_mask_kernel.cc diff --git a/paddle/phi/kernels/fusion/cpu/fused_softmax_mask_grad_kernel.cc b/paddle/phi/kernels/fusion/cpu/fused_softmax_mask_grad_kernel.cc new file mode 100644 index 0000000000000..eb94d71b956a0 --- /dev/null +++ b/paddle/phi/kernels/fusion/cpu/fused_softmax_mask_grad_kernel.cc @@ -0,0 +1,39 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/softmax_grad_kernel.h" + +namespace phi { +namespace fusion { + +template +void FusedSoftmaxMaskGradKernel(const Context& dev_ctx, + const DenseTensor& out, + const DenseTensor& out_grad, + DenseTensor* x_grad) { + dev_ctx.template Alloc(x_grad); + SoftmaxGradKernel( + dev_ctx, out, out_grad, 3, x_grad); // axis for softmax +} + +} // namespace fusion +} // namespace phi + +PD_REGISTER_KERNEL(fused_softmax_mask_grad, + CPU, + ALL_LAYOUT, + phi::fusion::FusedSoftmaxMaskGradKernel, + float, + double) {} diff --git a/paddle/phi/kernels/fusion/cpu/fused_softmax_mask_kernel.cc b/paddle/phi/kernels/fusion/cpu/fused_softmax_mask_kernel.cc new file mode 100644 index 0000000000000..5ed56188ceea6 --- /dev/null +++ b/paddle/phi/kernels/fusion/cpu/fused_softmax_mask_kernel.cc @@ -0,0 +1,84 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/elementwise_add_kernel.h" +#include "paddle/phi/kernels/softmax_kernel.h" + +namespace phi { +namespace fusion { + +template +void FusedSoftmaxMaskKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& mask, + DenseTensor* out) { + auto x_dim = x.dims(); + auto mask_dim = mask.dims(); + auto query_seq_len = x_dim[2]; + auto key_seq_len = x_dim[3]; + + PADDLE_ENFORCE_GT(query_seq_len, + 1, + phi::errors::InvalidArgument( + "Input x's second last dim must be large than 1 but " + "received the second last dimension of x is %d", + query_seq_len)); + + PADDLE_ENFORCE_EQ(key_seq_len >= 32 && key_seq_len < 8192, + true, + phi::errors::InvalidArgument( + "Input x's last dim must be between [32, 8192) " + "received the last dimension of x is %d", + key_seq_len)); + + PADDLE_ENFORCE_EQ(mask_dim[1], + 1, + phi::errors::InvalidArgument( + "Input mask's second dim must be 1 " + "received the second dimension of mask is %d", + mask_dim[1])); + + // dim of x and mask must be equal + for (size_t idx = 0; idx < 4; ++idx) { + if (idx == 1) continue; + PADDLE_ENFORCE_EQ( + x_dim[idx], + mask_dim[idx], + phi::errors::InvalidArgument( + "Input x's %dth dim should be equal with input mask's %dth dim " + "but " + "received the %dth dimension of x and mask are not equal " + "the %dth dim of x is %d, while the %dth dim of mask is %d.", + idx, + idx, + idx, + idx, + x_dim[idx], + idx, + mask_dim[idx])); + } + DenseTensor t = phi::Add(dev_ctx, x, mask); + SoftmaxKernel(dev_ctx, t, 3, out); // axis for softmax +} + +} // namespace fusion +} // namespace phi + +PD_REGISTER_KERNEL(fused_softmax_mask, + CPU, + ALL_LAYOUT, + phi::fusion::FusedSoftmaxMaskKernel, + float, + double) {} diff --git a/test/legacy_test/test_softmax_mask_fuse_op.py b/test/legacy_test/test_softmax_mask_fuse_op.py index 13bf6efe56889..79c6ad8c93525 100644 --- a/test/legacy_test/test_softmax_mask_fuse_op.py +++ b/test/legacy_test/test_softmax_mask_fuse_op.py @@ -51,16 +51,10 @@ def setUp(self): self.outputs = {'Out': rst} def test_check_output(self): - try: - self.check_output_with_place(core.CPUPlace()) - except NotImplementedError: - pass + self.check_output_with_place(core.CPUPlace()) def test_check_grad(self): - try: - self.check_grad_with_place(core.CPUPlace(), ["X"], "Out") - except NotImplementedError: - pass + self.check_grad_with_place(core.CPUPlace(), ["X"], "Out") @unittest.skipIf( From a9f877ff0a547755e44602196f093aebdbae2f37 Mon Sep 17 00:00:00 2001 From: Yuang Liu Date: Mon, 24 Jul 2023 15:35:47 +0800 Subject: [PATCH 12/34] [sharding stage 1 optim] Sharding comm overlap with backward (#55598) --- .../framework/distributed_strategy.proto | 2 + .../dygraph_sharding_optimizer.py | 50 +++- .../fleet/meta_parallel/pipeline_parallel.py | 12 +- .../fleet/meta_parallel/pp_utils/utils.py | 124 -------- .../fleet/utils/tensor_fusion_helper.py | 273 +++++++++++++++++- ...rid_parallel_sharding_model_with_fusion.py | 2 + test/legacy_test/test_fused_comm_buffer.py | 2 +- 7 files changed, 307 insertions(+), 158 deletions(-) diff --git a/paddle/fluid/framework/distributed_strategy.proto b/paddle/fluid/framework/distributed_strategy.proto index 74f377fd875de..ce489352d3bcf 100755 --- a/paddle/fluid/framework/distributed_strategy.proto +++ b/paddle/fluid/framework/distributed_strategy.proto @@ -68,6 +68,8 @@ message PpConfig { message DygraphShardingConfig { optional bool tensor_fusion = 1 [ default = false ]; + optional int32 accumulate_steps = 2 [ 
default = 1 ]; + optional bool comm_overlap = 3 [ default = false ]; } message HybridConfig { diff --git a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/dygraph_sharding_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/dygraph_sharding_optimizer.py index 63d261e2e3dfe..ccb5bfdcd030a 100755 --- a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/dygraph_sharding_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/dygraph_sharding_optimizer.py @@ -78,12 +78,23 @@ def __init__(self, optimizer, hcg): self.tensor_fusion = strategy.hybrid_configs[ 'sharding_configs' ].tensor_fusion + self.accumulate_steps = strategy.hybrid_configs[ + 'sharding_configs' + ].accumulate_steps + self.comm_overlap = strategy.hybrid_configs[ + 'sharding_configs' + ].comm_overlap pp_overlap = strategy.hybrid_configs['pp_configs'].sharding_comm_overlap - if self.tensor_fusion: + if self.tensor_fusion or self.comm_overlap: assert ( not pp_overlap ), "Can not enable pp's sharding_comm_overlap and sharding's tensor_fusion at the same time." + self._use_main_grad = hasattr(self._parameter_list[0], "main_grad") + self._rank2decay = {} + self._rank2fused = {} + self._comm_buffers = [] + self._rank2params = self._partition_parameters() self._param2rank = self._map_param_to_rank() @@ -95,25 +106,22 @@ def __init__(self, optimizer, hcg): '_param_groups', self._rank2params[self._sharding_rank] ) else: - self._use_main_grad = hasattr(self._parameter_list[0], "main_grad") - self._rank2decay = {} - self._rank2fused = {} self._tensor_fusion() decay_params = [ p.name for p in self._rank2decay[self._sharding_rank] ] - all_params = self._rank2fused[self._sharding_rank] + fused_params = self._rank2fused[self._sharding_rank] apply_decay_param_fun = lambda x: x in decay_params - params = [] + all_fused_params = [] for v in self._rank2fused.values(): - params += v - self._parameter_list = params - self._param_groups = params + all_fused_params += v + self._parameter_list = all_fused_params + self._param_groups = all_fused_params - self._set_inner_opt_attr('_parameter_list', all_params) - self._set_inner_opt_attr('_param_groups', all_params) + self._set_inner_opt_attr('_parameter_list', fused_params) + self._set_inner_opt_attr('_param_groups', fused_params) origin_decay_param_fun = getattr( self._inner_opt, '_apply_decay_param_fun', None ) @@ -145,11 +153,23 @@ def clear_grad(self, set_to_zero=True): p.clear_gradient(set_to_zero) def _tensor_fusion(self): + comm_group = self._hcg.get_sharding_parallel_group() for i in range(self._sharding_world_size): params = self._rank2params[i] - decay_fused, all_fused = fused_parameters( - params, self._use_main_grad + dst = comm_group.ranks[i] + # TODO(sharding dev): make scale_after_comm a field to be configured by user + decay_fused, all_fused, all_buffer = fused_parameters( + params, + use_main_grad=self._use_main_grad, + fuse_param=True, + comm_overlap=self.comm_overlap, + comm_group=comm_group, + dst=dst, + acc_step=self.accumulate_steps, + scale_after_comm=False, ) + if self.comm_overlap: + self._comm_buffers += all_buffer self._rank2decay[i] = decay_fused self._rank2fused[i] = all_fused for p in all_fused: @@ -199,6 +219,10 @@ def _map_param_to_rank(self): def reduce_gradients(self, parameter_list, hcg): # TODO merge grad / nrank with dp logger.debug("sharding start gradients sync") + if self.comm_overlap: + for buffer in self._comm_buffers: + buffer.scale_grads() + return with 
framework.no_grad(): sharding_nrank = hcg.get_sharding_parallel_group().nranks for param in parameter_list: diff --git a/python/paddle/distributed/fleet/meta_parallel/pipeline_parallel.py b/python/paddle/distributed/fleet/meta_parallel/pipeline_parallel.py index 6644e2a06e5fe..2038a4c4e4606 100755 --- a/python/paddle/distributed/fleet/meta_parallel/pipeline_parallel.py +++ b/python/paddle/distributed/fleet/meta_parallel/pipeline_parallel.py @@ -37,11 +37,11 @@ from .pp_utils import p2p_communication as p2p from paddle.distributed.fleet.utils.tensor_fusion_helper import ( + HOOK_ACTION, + FusedCommBuffer, assign_group_by_size, ) -from .pp_utils.utils import HOOK_ACTION, FusedCommBuffer - __all__ = [] g_shard_use_reduce = int(os.environ.get("FLAGS_shard_use_reduce", 0)) @@ -334,9 +334,11 @@ def register_allreduce_overlap_hook(self, model, comm_group, acc_steps, dp): for dst in fused_parameter_group: parameter_list = fused_parameter_group[dst] - if not dp: + if act != HOOK_ACTION.ALL_REDUCE: # parse the relative dst rank to absolute dst rank for sharding dst = comm_group.ranks[dst] + else: + dst = -1 var_groups = assign_group_by_size(parameter_list) for group_idx, parameters in var_groups.items(): buffer = FusedCommBuffer( @@ -515,7 +517,7 @@ def forward_backward_pipeline( if self._comm_overlap: assert len(self._comm_buffers) > 0 for buffer in self._comm_buffers: - buffer.scale_and_split_grads() + buffer.scale_grads() if self._enable_timer: self.timers("allreduce_shared_weight_gradients").start() @@ -1256,7 +1258,7 @@ def forward_backward_pipeline( if self._comm_overlap: assert len(self._comm_buffers) > 0 for buffer in self._comm_buffers: - buffer.scale_and_split_grads() + buffer.scale_grads() if static_scheduler: self._reset_counter() diff --git a/python/paddle/distributed/fleet/meta_parallel/pp_utils/utils.py b/python/paddle/distributed/fleet/meta_parallel/pp_utils/utils.py index 6c8e2fd9dc3aa..33b8c3d95d582 100644 --- a/python/paddle/distributed/fleet/meta_parallel/pp_utils/utils.py +++ b/python/paddle/distributed/fleet/meta_parallel/pp_utils/utils.py @@ -15,19 +15,10 @@ import paddle from paddle import _legacy_C_ops -from paddle.distributed.fleet.utils.tensor_fusion_helper import ( - flatten_dense_tensors, -) -from paddle.framework import base as imperative_base __all__ = [] -class HOOK_ACTION: - ALL_REDUCE = 0 - REDUCE = 1 - - FLOAT_TYPE_DICT = { paddle.float16: "float16", paddle.float32: "float32", @@ -116,118 +107,3 @@ def _all_gather(tensor, group=None, use_calc_stream=True): 'nranks', nranks, ) - - -class FusedCommBuffer: - def __init__(self, id, params, comm_group, acc_steps=1, act=None, dst=-1): - self._id = id - self._params = params - self._acc_steps = acc_steps - self._comm_group = comm_group - - self.use_main_grad = hasattr(self._params[0], "main_grad") - - self._task = None - self._params_step_dict = {} - self._params_checked_in = 0 - self._params_to_addr = {} - - self._act = act - if self._act == HOOK_ACTION.ALL_REDUCE: - assert dst == -1 - elif self._act == HOOK_ACTION.REDUCE: - assert dst != -1 - else: - raise ValueError( - "The act should be allreudce for dp or reduce for sharding." 
- ) - self._dst = dst - - self._init_step_dict() - - self.grad_storage = flatten_dense_tensors( - self._params, - use_main_grad=self.use_main_grad, - fuse_param=False, - warp_buffer=False, - ).buffer - - self._record_addr() - - def _record_addr(self): - for param in self._params: - addr = ( - param.main_grad.data_ptr() - if self.use_main_grad - else param.grad.data_ptr() - ) - self._params_to_addr[param.name] = addr - - def _init_step_dict(self): - for p in self._params: - self._params_step_dict[p.name] = 0 - - def _reset_params_checked_in(self): - self._task = None - self._init_step_dict() - self._params_checked_in = 0 - - @property - def _all_params_checked_in(self): - return ( - len(self._params) == self._params_checked_in - and len(self._params_step_dict) == 0 - ) - - def add_grad(self, param): - assert param.name in self._params_step_dict - current_ptr = ( - param.main_grad.data_ptr() - if self.use_main_grad - else param.grad.data_ptr() - ) - if self._params_to_addr[param.name] != current_ptr: - raise ValueError( - "The address of the grad/main_grad of the param has been changed during training, " - "which is not allowed for dp/sharding overlap with pp. " - "This may be caused by some non-inplace operations on the grad/main_grad. " - "Please use the inplace version of the operations or disable the overlapping." - ) - - self._params_step_dict[param.name] += 1 - - if self._params_step_dict[param.name] == self._acc_steps: - self._params_checked_in += 1 - self._params_step_dict.pop(param.name) - - if self._all_params_checked_in: - self._comm_grads() - - @imperative_base.no_grad - def _comm_grads(self): - assert self._all_params_checked_in - - if self._act == HOOK_ACTION.ALL_REDUCE: - task = paddle.distributed.all_reduce( - self.grad_storage, group=self._comm_group, sync_op=False - ) - - elif self._act == HOOK_ACTION.REDUCE: - task = paddle.distributed.reduce( - self.grad_storage, - dst=self._dst, - group=self._comm_group, - sync_op=False, - ) - - self._task = task - - @imperative_base.no_grad - def scale_and_split_grads(self): - assert self._task is not None - self._task.wait() - - scale_factor = 1.0 / self._comm_group.nranks - self.grad_storage.scale_(scale_factor) - - self._reset_params_checked_in() diff --git a/python/paddle/distributed/fleet/utils/tensor_fusion_helper.py b/python/paddle/distributed/fleet/utils/tensor_fusion_helper.py index 403f9d5d9a6c1..f2720b04ea093 100644 --- a/python/paddle/distributed/fleet/utils/tensor_fusion_helper.py +++ b/python/paddle/distributed/fleet/utils/tensor_fusion_helper.py @@ -12,13 +12,21 @@ # See the License for the specific language governing permissions and # limitations under the License. 
import itertools +import os from collections import OrderedDict import numpy as np import paddle +from paddle.framework import base as imperative_base from paddle.framework import core + +class HOOK_ACTION: + ALL_REDUCE = 0 + REDUCE = 1 + + alignment = { "gpu": 256, } @@ -101,23 +109,204 @@ def flatten_dense_tensors( return grad_storage -def obtain_storage(parameters, use_main_grad, clip, dist): +def bw_hook_func(buffer, param): + @paddle.autograd.no_grad() + def fused_comm(*_): + buffer.add_grad(param) + + return fused_comm + + +class FusedCommBuffer: + def __init__( + self, + id, + params, + comm_group, + acc_steps=1, + act=None, + dst=-1, + use_main_grad=None, + fuse_param=False, + scale_after_comm=True, + ): + self._id = id + self._params = params + self._acc_steps = acc_steps + self._comm_group = comm_group + self._scale_after_comm = scale_after_comm + self._fuse_param = fuse_param + + self.use_main_grad = ( + use_main_grad + if use_main_grad is not None + else hasattr(self._params[0], "main_grad") + ) + + self._task = None + self._params_step_dict = {} + self._params_checked_in = 0 + self._grads_to_addr = {} + + self._act = act + if self._act == HOOK_ACTION.ALL_REDUCE: + assert dst == -1 + elif self._act == HOOK_ACTION.REDUCE: + assert dst != -1 + else: + raise ValueError( + "The act should be allreduce for dp or reduce for sharding." + ) + self._dst = dst + + self._init_step_dict() + + if self._fuse_param: + self.param_storage, self.grad_storage = flatten_dense_tensors( + self._params, + use_main_grad=use_main_grad, + fuse_param=True, + warp_buffer=True, + ) + self.param_storage = self.param_storage.buffer + self.grad_storage = self.grad_storage.buffer + else: + self.param_storage = None + self.grad_storage = flatten_dense_tensors( + self._params, + use_main_grad=self.use_main_grad, + fuse_param=False, + warp_buffer=False, + ).buffer + + self._record_addr() + + def _record_addr(self): + for param in self._params: + addr = ( + param.main_grad.data_ptr() + if self.use_main_grad + else param.grad.data_ptr() + ) + self._grads_to_addr[param.name] = addr + + def _init_step_dict(self): + for p in self._params: + self._params_step_dict[p.name] = 0 + + def _reset_params_checked_in(self): + self._task = None + self._init_step_dict() + self._params_checked_in = 0 + + @property + def _all_params_checked_in(self): + return ( + len(self._params) == self._params_checked_in + and len(self._params_step_dict) == 0 + ) + + def add_grad(self, param): + assert param.name in self._params_step_dict + current_ptr = ( + param.main_grad.data_ptr() + if self.use_main_grad + else param.grad.data_ptr() + ) + if self._grads_to_addr[param.name] != current_ptr: + raise ValueError( + "The address of the grad/main_grad of the param has been changed during training, " + "which is not allowed for dp/sharding overlap with pp. " + "This may be caused by some non-inplace operations on the grad/main_grad. " + "Please use the inplace version of the operations or disable the overlapping."
+ ) + + self._params_step_dict[param.name] += 1 + + if self._params_step_dict[param.name] == self._acc_steps: + self._params_checked_in += 1 + self._params_step_dict.pop(param.name) + + if self._all_params_checked_in: + self._comm_grads() + + @imperative_base.no_grad + def _comm_grads(self): + assert self._all_params_checked_in + + if not self._scale_after_comm: + scale_factor = 1.0 / self._comm_group.nranks + self.grad_storage.scale_(scale_factor) + + if self._act == HOOK_ACTION.ALL_REDUCE: + task = paddle.distributed.all_reduce( + self.grad_storage, group=self._comm_group, sync_op=False + ) + + elif self._act == HOOK_ACTION.REDUCE: + task = paddle.distributed.reduce( + self.grad_storage, + dst=self._dst, + group=self._comm_group, + sync_op=False, + ) + + self._task = task + + @imperative_base.no_grad + def scale_grads(self): + assert self._task is not None + self._task.wait() + + if self._scale_after_comm: + scale_factor = 1.0 / self._comm_group.nranks + self.grad_storage.scale_(scale_factor) + + self._reset_params_checked_in() + + +def obtain_storage( + parameters, + use_main_grad=False, + clip=True, + dist=False, + fuse_param=True, + comm_overlap=False, + act=None, + comm_group=None, + dst=-1, + acc_steps=1, + scale_after_comm=False, +): if len(parameters) < 1: - return [] + return [], [] var_groups = assign_group_by_size(parameters, group_size=256 * 1024 * 1024) storage = [] + buffers = [] for group_idx, parameters in var_groups.items(): - param_storage, grad_storage = flatten_dense_tensors( + comm_buffer = FusedCommBuffer( + group_idx, parameters, + comm_group=comm_group, + acc_steps=acc_steps, + act=act, + dst=dst, use_main_grad=use_main_grad, - fuse_param=True, - warp_buffer=True, + fuse_param=fuse_param, + scale_after_comm=scale_after_comm, ) - param_storage.buffer.need_clip = clip - param_storage.buffer.is_distributed = dist - storage.append(param_storage.buffer) - return storage + if fuse_param: + param_buffer = comm_buffer.param_storage + param_buffer.need_clip = clip + param_buffer.is_distributed = dist + storage.append(param_buffer) + if comm_overlap: + for param in parameters: + param._register_backward_hook(bw_hook_func(comm_buffer, param)) + buffers.append(comm_buffer) + + return storage, buffers def filter_params(params, is_fp32, is_distributed, need_clip): @@ -155,7 +344,38 @@ def filter_params(params, is_fp32, is_distributed, need_clip): return params, dtype -def fused_parameters(parameters, use_main_grad): +def fused_parameters( + parameters, + use_main_grad=False, + fuse_param=True, + comm_overlap=False, + comm_group=None, + dst=-1, + acc_step=1, + scale_after_comm=False, +): + """ + Fuse gradients. Fuse parameters if enabled. Prepare for comm overlap if enabled. + :param parameters: all parameters to be fused. + :param use_main_grad: whether the gradient uses main grad or not + :param comm_overlap: enable comm overlap or not + :param comm_group: the comm group for comm overlap + :param dst: the dst for comm overlap + :param acc_step: acc steps, used for comm overlap + :param fuse_param: fuse param or not + :param scale_after_comm: if comm overlap is enabled, specify whether grads are scaled before or after the communication + :return: param storage if fused, comm buffers if comm overlap is enabled + """ + g_shard_use_reduce = int(os.environ.get("FLAGS_shard_use_reduce", 0)) + act = ( + HOOK_ACTION.ALL_REDUCE if not g_shard_use_reduce else HOOK_ACTION.REDUCE + ) + if comm_overlap: + assert comm_group is not None + if act == HOOK_ACTION.REDUCE: + assert dst != -1 + elif act == HOOK_ACTION.ALL_REDUCE: + dst = -1 param_groups = [] attrs = [] @@ -178,6 +398,7 @@ def fused_parameters(parameters, use_main_grad): decay_fused = [] all_fused = [] + all_buffers = [] for params, attr in zip(param_groups, attrs): decay_params = [] other_params = [] @@ -190,14 +411,36 @@ def fused_parameters(parameters, use_main_grad): is_distributed = attr[1] need_clip = attr[2] - decay = obtain_storage( - decay_params, use_main_grad, need_clip, is_distributed + decay, decay_buffers = obtain_storage( + decay_params, + use_main_grad=use_main_grad, + clip=need_clip, + dist=is_distributed, + fuse_param=fuse_param, + comm_overlap=comm_overlap, + act=act, + comm_group=comm_group, + dst=dst, + acc_steps=acc_step, + scale_after_comm=scale_after_comm, ) - other = obtain_storage( - other_params, use_main_grad, need_clip, is_distributed + other, other_buffers = obtain_storage( + other_params, + fuse_param=fuse_param, + comm_overlap=comm_overlap, + use_main_grad=use_main_grad, + clip=need_clip, + dist=is_distributed, + act=act, + comm_group=comm_group, + dst=dst, + acc_steps=acc_step, + scale_after_comm=scale_after_comm, ) decay_fused += decay all_fused += decay all_fused += other + all_buffers += decay_buffers + all_buffers += other_buffers - return decay_fused, all_fused + return decay_fused, all_fused, all_buffers diff --git a/test/collective/fleet/hybrid_parallel_sharding_model_with_fusion.py b/test/collective/fleet/hybrid_parallel_sharding_model_with_fusion.py index 310313119b4c3..e70656a4ce608 100644 --- a/test/collective/fleet/hybrid_parallel_sharding_model_with_fusion.py +++ b/test/collective/fleet/hybrid_parallel_sharding_model_with_fusion.py @@ -99,6 +99,8 @@ def setUp(self): "pp_degree": 1, } self.strategy.hybrid_configs["sharding_configs"].tensor_fusion = True + self.strategy.hybrid_configs["sharding_configs"].comm_overlap = True + self.strategy.hybrid_configs["sharding_configs"].accumulate_steps = 1 fleet.init(is_collective=True, strategy=self.strategy) self.data = np.random.randint( 0, diff --git a/test/legacy_test/test_fused_comm_buffer.py b/test/legacy_test/test_fused_comm_buffer.py index ad771b6dfe5a2..25d9a2748bd0e 100644 --- a/test/legacy_test/test_fused_comm_buffer.py +++ b/test/legacy_test/test_fused_comm_buffer.py @@ -15,7 +15,7 @@ import unittest import paddle -from paddle.distributed.fleet.meta_parallel.pp_utils.utils import ( +from paddle.distributed.fleet.utils.tensor_fusion_helper import ( HOOK_ACTION, FusedCommBuffer, ) From f6161d1e180d76d099c59ab6d13663c2b86bbbc7 Mon Sep 17 00:00:00 2001 From: Yichen Zhang <32740647+pkuzyc@users.noreply.github.com> Date: Mon, 24 Jul 2023 16:56:24 +0800 Subject: [PATCH 13/34] [Semi-Auto] Add transpose spmd rule (#55350) * [Semi-Auto] Add transpose spmd rule * add unit test in cmake file * log perm info --- 
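Note (after the fold, not part of the commit message): the rule added below propagates sharding through transpose by labeling each tensor axis with an einsum-style letter and permuting the labels. A minimal Python sketch of that propagation, for intuition only; the function name infer_transpose_dims_mapping is illustrative and not a Paddle API, and the real rule additionally merges shardings across inputs via ShardingMergeForTensors:

def infer_transpose_dims_mapping(input_dims_mapping, perm):
    # Label each input axis with a letter, mirroring TransposeSPMDRule::InferForward.
    ndim = len(input_dims_mapping)
    input_axes = "abcdefghijklmnopqrstuvwxyz"[:ndim]
    # Normalize negative perm entries, as the C++ rule does (perm[i] < 0 means ndim + perm[i]).
    perm = [p + ndim if p < 0 else p for p in perm]
    # The output axes are the input labels permuted by perm.
    output_axes = "".join(input_axes[p] for p in perm)
    # Each output axis inherits the mesh dim of the input axis it came from.
    axis_to_dim = dict(zip(input_axes, input_dims_mapping))
    return [axis_to_dim[axis] for axis in output_axes]

# A [batch, head, seq, hidden] tensor sharded as [0, -1, 1, -1] and transposed
# with perm=[0, 2, 1, 3] keeps each shard attached to the axis it moves with:
assert infer_transpose_dims_mapping([0, -1, 1, -1], [0, 2, 1, 3]) == [0, 1, -1, -1]

Since transpose has a single input, the merge step is effectively a pass-through here; the dims mapping is only reordered, never reduced.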
.../auto_parallel/spmd_rules/common.h | 2 +- .../auto_parallel/spmd_rules/rules.h | 4 + .../spmd_rules/transpose_spmd_rule.cc | 103 ++++++++++++ .../spmd_rules/transpose_spmd_rule.h | 40 +++++ test/auto_parallel/spmd_rules/CMakeLists.txt | 1 + .../spmd_rules/test_transpose_rule.py | 154 ++++++++++++++++++ 6 files changed, 303 insertions(+), 1 deletion(-) create mode 100644 paddle/fluid/distributed/auto_parallel/spmd_rules/transpose_spmd_rule.cc create mode 100644 paddle/fluid/distributed/auto_parallel/spmd_rules/transpose_spmd_rule.h create mode 100644 test/auto_parallel/spmd_rules/test_transpose_rule.py diff --git a/paddle/fluid/distributed/auto_parallel/spmd_rules/common.h b/paddle/fluid/distributed/auto_parallel/spmd_rules/common.h index 35fb67938ee22..f5a49ab0a9f18 100644 --- a/paddle/fluid/distributed/auto_parallel/spmd_rules/common.h +++ b/paddle/fluid/distributed/auto_parallel/spmd_rules/common.h @@ -77,7 +77,7 @@ class SPMDRuleBase { PADDLE_ENFORCE_NE(iter, attrs.end(), paddle::platform::errors::NotFound( - "(%s) is not found in AttributeMap.")); + "(%s) is not found in AttributeMap.", name)); return iter->second; } }; diff --git a/paddle/fluid/distributed/auto_parallel/spmd_rules/rules.h b/paddle/fluid/distributed/auto_parallel/spmd_rules/rules.h index bba4339198021..713a52770926d 100644 --- a/paddle/fluid/distributed/auto_parallel/spmd_rules/rules.h +++ b/paddle/fluid/distributed/auto_parallel/spmd_rules/rules.h @@ -24,6 +24,7 @@ #include "paddle/fluid/distributed/auto_parallel/spmd_rules/replicated_spmd_rule.h" #include "paddle/fluid/distributed/auto_parallel/spmd_rules/softmax_spmd_rule.h" #include "paddle/fluid/distributed/auto_parallel/spmd_rules/split_spmd_rule.h" +#include "paddle/fluid/distributed/auto_parallel/spmd_rules/transpose_spmd_rule.h" // TODO(ljz) Automatic this process in cmake file. namespace paddle { @@ -155,6 +156,9 @@ REGISTER_SPMD_RULE(softmax_with_cross_entropy, CrossEntropyWithSoftmaxSPMDRule); REGISTER_SPMD_RULE(split, SplitSPMDRule); REGISTER_SPMD_RULE(split_with_num, SplitSPMDRule); +// transpose rule +REGISTER_SPMD_RULE(transpose, TransposeSPMDRule); + } // namespace auto_parallel } // namespace distributed } // namespace paddle diff --git a/paddle/fluid/distributed/auto_parallel/spmd_rules/transpose_spmd_rule.cc b/paddle/fluid/distributed/auto_parallel/spmd_rules/transpose_spmd_rule.cc new file mode 100644 index 0000000000000..fe567e70fa019 --- /dev/null +++ b/paddle/fluid/distributed/auto_parallel/spmd_rules/transpose_spmd_rule.cc @@ -0,0 +1,103 @@ +/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/
+
+#include "paddle/fluid/distributed/auto_parallel/spmd_rules/transpose_spmd_rule.h"
+#include "paddle/phi/core/distributed/auto_parallel/utils.h"
+
+namespace paddle {
+namespace distributed {
+namespace auto_parallel {
+using phi::distributed::auto_parallel::str_join;
+std::pair<std::vector<TensorDistAttr>, std::vector<TensorDistAttr>>
+TransposeSPMDRule::InferForward(const std::vector<DistTensorSpec>& input_specs,
+                                const paddle::framework::AttributeMap& attrs) {
+  // step0: Verify Input Args Based on Transpose Logic
+  int64_t ninputs = input_specs.size();
+  PADDLE_ENFORCE_EQ(
+      ninputs,
+      1,
+      phi::errors::InvalidArgument("The size of InputSpec in transpose must "
+                                   "be equal to 1, but got [%d].",
+                                   ninputs));
+  VerifySpecs(input_specs, "transpose");
+
+  // step1: Build Einsum Notation
+  std::vector<int64_t> perm_dims =
+      ExtractAttr<std::vector<int64_t>>("perm", attrs);
+  std::string alphabet = "abcdefghijklmnopqrstuvwxyz";
+
+  // get einsum notation for input
+  int64_t ndim = input_specs[0].shape().size();
+  std::vector<std::string> input_axes_vec;
+  std::string input_axes = alphabet.substr(0, ndim);
+  input_axes_vec.emplace_back(input_axes);
+
+  // get einsum notation for output
+  for (int64_t i = 0, n = perm_dims.size(); i < n; ++i) {
+    // convert the negative dim value to normal dim value
+    if (perm_dims[i] < 0) {
+      perm_dims[i] = ndim + perm_dims[i];
+    }
+  }
+  std::string output_axes = "";
+  for (int64_t i = 0; i < ndim; i++) {
+    output_axes.append(1, input_axes[perm_dims[i]]);
+  }
+
+  // step2: Sharding Propagation
+  // step2.1: merge input shardings
+  std::vector<std::pair<std::string, std::vector<int64_t>>> axes_sharding_info;
+  axes_sharding_info = GetAxesDimsMappingPair(input_axes_vec, input_specs);
+  std::unordered_map<std::string, int64_t> axis_to_dim_map =
+      ShardingMergeForTensors(axes_sharding_info);
+
+  // step2.2: infer output dims mapping from merged input dims mapping
+  std::vector<int64_t> output_dims_mapping =
+      GetDimsMappingForAxes(output_axes, axis_to_dim_map);
+
+  // initialize output dist_attr's process_mesh, batch_dim and dynamic dims with
+  // input dist_attr.
+  TensorDistAttr output_dist_attr =
+      CopyTensorDistAttrForOutput(input_specs[0].dist_attr());
+  output_dist_attr.set_dims_mapping(output_dims_mapping);
+
+  // Step2.3 handle input tensor partial (TODO)
+  VLOG(4) << "TransposeSPMDRule InferForward:";
+  for (int64_t i = 0; i < ninputs; i++) {
+    VLOG(4) << "Input" << std::to_string(i) << " shape: ["
+            << str_join(input_specs[i].shape()) << "] "
+            << "src_dims_mapping: [" << str_join(input_specs[i].dims_mapping())
+            << "] "
+            << "perm: [" << str_join(perm_dims) << "] "
+            << "dst_dims_mapping: [" << str_join(input_specs[i].dims_mapping())
+            << "]";
+  }
+  VLOG(4) << "Output dims_mapping: [" + str_join(output_dims_mapping) + "]\n\n";
+
+  return {{input_specs[0].dist_attr()}, {output_dist_attr}};
+}
+
+std::pair<std::vector<TensorDistAttr>, std::vector<TensorDistAttr>>
+TransposeSPMDRule::InferBackward(
+    const std::vector<DistTensorSpec>& output_specs,
+    const paddle::framework::AttributeMap& attrs) {
+  PADDLE_THROW(phi::errors::Unimplemented(
+      "InferBackward of TransposeSPMDRule is NOT implemented yet."));
+
+  return {};
+}
+
+}  // namespace auto_parallel
+}  // namespace distributed
+}  // namespace paddle
diff --git a/paddle/fluid/distributed/auto_parallel/spmd_rules/transpose_spmd_rule.h b/paddle/fluid/distributed/auto_parallel/spmd_rules/transpose_spmd_rule.h
new file mode 100644
index 0000000000000..b047932036a71
--- /dev/null
+++ b/paddle/fluid/distributed/auto_parallel/spmd_rules/transpose_spmd_rule.h
@@ -0,0 +1,40 @@
+/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
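The einsum notation in the forward rule above reduces to a pure permutation of the dims mapping: output axis i inherits the sharding of input axis perm[i], once negative entries in perm are normalized. A standalone Python sketch of exactly that step (an illustration, not Paddle code):

    def transpose_dims_mapping(dims_mapping, perm):
        # dims_mapping[d] is the mesh dim that shards tensor axis d (-1 means
        # replicated); perm follows transpose semantics, negatives wrap around.
        ndim = len(dims_mapping)
        perm = [p + ndim if p < 0 else p for p in perm]
        return [dims_mapping[p] for p in perm]

    # Both cases mirror expectations from the unit test below:
    assert transpose_dims_mapping([-1, 0, 1, -1], [0, 2, 3, 1]) == [-1, 1, -1, 0]
    assert transpose_dims_mapping([-1, -1, 0, 1], [-1, 0, -2, 1]) == [1, -1, 0, -1]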
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include
+#include
+#include
+#include
+
+#include "paddle/fluid/distributed/auto_parallel/spmd_rules/common.h"
+
+namespace paddle {
+namespace distributed {
+namespace auto_parallel {
+
+class TransposeSPMDRule : public SPMDRuleBase {
+ public:
+  std::pair<std::vector<TensorDistAttr>, std::vector<TensorDistAttr>>
+  InferForward(const std::vector<DistTensorSpec>& input_specs,
+               const paddle::framework::AttributeMap& attrs) override;
+
+  std::pair<std::vector<TensorDistAttr>, std::vector<TensorDistAttr>>
+  InferBackward(const std::vector<DistTensorSpec>& output_specs,
+                const paddle::framework::AttributeMap& attrs) override;
+};
+}  // namespace auto_parallel
+}  // namespace distributed
+}  // namespace paddle
diff --git a/test/auto_parallel/spmd_rules/CMakeLists.txt b/test/auto_parallel/spmd_rules/CMakeLists.txt
index 1da9d4674c381..43afd9aed75e7 100644
--- a/test/auto_parallel/spmd_rules/CMakeLists.txt
+++ b/test/auto_parallel/spmd_rules/CMakeLists.txt
@@ -9,6 +9,7 @@ if(WITH_DISTRIBUTE AND WITH_GPU)
   py_test_modules(test_matmul_rule MODULES test_replicated_rule)
   py_test_modules(test_matmul_rule MODULES test_softmax_rule)
   py_test_modules(test_split_rule MODULES test_split_rule)
+  py_test_modules(test_transpose_rule MODULES test_transpose_rule)
 
   # End of unittests WITH single card WITHOUT timeout
 endif()
diff --git a/test/auto_parallel/spmd_rules/test_transpose_rule.py b/test/auto_parallel/spmd_rules/test_transpose_rule.py
new file mode 100644
index 0000000000000..62c86c3cf3f38
--- /dev/null
+++ b/test/auto_parallel/spmd_rules/test_transpose_rule.py
@@ -0,0 +1,154 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+from paddle.distributed.auto_parallel.static.completion import get_spmd_rule
+from paddle.distributed.auto_parallel.static.dist_attribute import (
+    DistTensorSpec,
+    TensorDistAttr,
+)
+from paddle.distributed.fleet import auto
+
+
+class TestTransposeSPMDRule(unittest.TestCase):
+    """
+    Unit tests for transpose spmd rule.
+ """ + + def setUp(self): + self.rule = get_spmd_rule("transpose") + + x_shape = [64, 36] + process_mesh = auto.ProcessMesh(mesh=[0, 1, 2, 3]) + + x_tensor_dist_attr = TensorDistAttr() + x_tensor_dist_attr.dims_mapping = [1, 0] + x_tensor_dist_attr.process_mesh = process_mesh + self.x_dist_tensor_spec = DistTensorSpec(x_shape, x_tensor_dist_attr) + + self.attrs = { + 'perm': [0, 1, 2, 3], + } + + def test_single_mesh_dim(self): + # perm = [1, 0] + # [0, -1] --> [0, -1], [-1, 0] + self.attrs['perm'] = [1, 0] + self.x_dist_tensor_spec.set_dims_mapping([0, -1]) + result_dist_attrs = self.rule.infer_forward( + [self.x_dist_tensor_spec], self.attrs + ) + infered_input_dist_attrs = result_dist_attrs[0] + infered_output_dist_attrs = result_dist_attrs[1] + + self.assertEqual(len(result_dist_attrs), 2) + self.assertEqual(len(infered_input_dist_attrs), 1) + self.assertEqual(len(infered_output_dist_attrs), 1) + self.assertEqual(infered_input_dist_attrs[0].dims_mapping, [0, -1]) + self.assertEqual(infered_output_dist_attrs[0].dims_mapping, [-1, 0]) + + # perm = [0, 1] + # [0, -1] --> [0, -1], [0, -1] + self.attrs['perm'] = [0, 1] + self.x_dist_tensor_spec.set_dims_mapping([0, -1]) + result_dist_attrs = self.rule.infer_forward( + [self.x_dist_tensor_spec], self.attrs + ) + infered_input_dist_attrs = result_dist_attrs[0] + infered_output_dist_attrs = result_dist_attrs[1] + + self.assertEqual(infered_input_dist_attrs[0].dims_mapping, [0, -1]) + self.assertEqual(infered_output_dist_attrs[0].dims_mapping, [0, -1]) + + # perm = [0, 2, 3, 1] + # [-1, -1, 0, -1] --> [-1, -1, 0, -1], [-1, 0, -1, -1] + self.x_dist_tensor_spec.shape = [64, 48, 36, 24] + self.attrs['perm'] = [0, 2, 3, 1] + self.x_dist_tensor_spec.set_dims_mapping([-1, -1, 0, -1]) + result_dist_attrs = self.rule.infer_forward( + [self.x_dist_tensor_spec], self.attrs + ) + infered_input_dist_attrs = result_dist_attrs[0] + infered_output_dist_attrs = result_dist_attrs[1] + + self.assertEqual( + infered_input_dist_attrs[0].dims_mapping, [-1, -1, 0, -1] + ) + self.assertEqual( + infered_output_dist_attrs[0].dims_mapping, [-1, 0, -1, -1] + ) + + def test_multi_mesh_dim(self): + process_mesh = auto.ProcessMesh(mesh=[[0, 1, 2], [3, 4, 5]]) + self.x_dist_tensor_spec.set_process_mesh(process_mesh) + self.x_dist_tensor_spec.shape = [64, 48, 36, 24] + + # perm = [0, 2, 3, 1] + # [-1, 0, 1, -1] --> [-1, 0, 1, -1], [-1, 1, -1, 0] + self.attrs['perm'] = [0, 2, 3, 1] + self.x_dist_tensor_spec.set_dims_mapping([-1, 0, 1, -1]) + result_dist_attrs = self.rule.infer_forward( + [self.x_dist_tensor_spec], self.attrs + ) + infered_input_dist_attrs = result_dist_attrs[0] + infered_output_dist_attrs = result_dist_attrs[1] + + self.assertEqual(len(result_dist_attrs), 2) + self.assertEqual(len(infered_input_dist_attrs), 1) + self.assertEqual(len(infered_output_dist_attrs), 1) + self.assertEqual( + infered_input_dist_attrs[0].dims_mapping, [-1, 0, 1, -1] + ) + self.assertEqual( + infered_output_dist_attrs[0].dims_mapping, [-1, 1, -1, 0] + ) + + # perm = [0, 2, 3, 1] + # [-1, -1, -1, -1] --> [-1, -1, -1, -1], [-1, -1, -1, -1] + self.attrs['perm'] = [0, 2, 3, 1] + self.x_dist_tensor_spec.set_dims_mapping([-1, -1, -1, -1]) + result_dist_attrs = self.rule.infer_forward( + [self.x_dist_tensor_spec], self.attrs + ) + infered_input_dist_attrs = result_dist_attrs[0] + infered_output_dist_attrs = result_dist_attrs[1] + + self.assertEqual( + infered_input_dist_attrs[0].dims_mapping, [-1, -1, -1, -1] + ) + self.assertEqual( + infered_output_dist_attrs[0].dims_mapping, [-1, -1, -1, 
-1] + ) + + # perm = [-1, 0, -2, 1] + # [-1, -1, 0, 1] --> [-1, -1, 0, 1], [1, -1, 0, -1] + self.attrs['perm'] = [-1, 0, -2, 1] + self.x_dist_tensor_spec.set_dims_mapping([-1, -1, 0, 1]) + result_dist_attrs = self.rule.infer_forward( + [self.x_dist_tensor_spec], self.attrs + ) + infered_input_dist_attrs = result_dist_attrs[0] + infered_output_dist_attrs = result_dist_attrs[1] + + self.assertEqual( + infered_input_dist_attrs[0].dims_mapping, [-1, -1, 0, 1] + ) + self.assertEqual( + infered_output_dist_attrs[0].dims_mapping, [1, -1, 0, -1] + ) + + +if __name__ == "__main__": + unittest.main() From 0f0dfe9aee46a042c3af05c3cade3af881ff2ba9 Mon Sep 17 00:00:00 2001 From: Windfarer Date: Mon, 24 Jul 2023 17:54:05 +0800 Subject: [PATCH 14/34] [Bug Fix] convert environment variables' types (#55586) --- .../distributed/launch/context/__init__.py | 5 +- .../distributed/launch/context/args_envs.py | 48 +++++++++---------- 2 files changed, 27 insertions(+), 26 deletions(-) diff --git a/python/paddle/distributed/launch/context/__init__.py b/python/paddle/distributed/launch/context/__init__.py index b252e966021bc..9083347a51158 100644 --- a/python/paddle/distributed/launch/context/__init__.py +++ b/python/paddle/distributed/launch/context/__init__.py @@ -97,8 +97,9 @@ def continous_log(self) -> bool: def set_env_in_args(self): for k, v in env_args_mapping.items(): + attr, attr_type = v if k in self.envs: print( - f"LAUNCH WARNNING args {v} is override by env {self.envs[k]}" + f"LAUNCH WARNNING args {attr} will be overridden by env: {k} value: {self.envs[k]}" ) - setattr(self.args, v, self.envs[k]) + setattr(self.args, attr, attr_type(self.envs[k])) diff --git a/python/paddle/distributed/launch/context/args_envs.py b/python/paddle/distributed/launch/context/args_envs.py index 7dc410de3450d..56eac96f1b8b9 100644 --- a/python/paddle/distributed/launch/context/args_envs.py +++ b/python/paddle/distributed/launch/context/args_envs.py @@ -17,30 +17,30 @@ from distutils.util import strtobool env_args_mapping = { - 'POD_IP': 'host', - 'PADDLE_MASTER': 'master', - 'PADDLE_DEVICES': 'devices', - 'PADDLE_NNODES': 'nnodes', - 'PADDLE_RUN_MODE': 'run_mode', - 'PADDLE_LOG_LEVEL': 'log_level', - 'PADDLE_LOG_OVERWRITE': 'log_overwrite', - 'PADDLE_SORT_IP': 'sort_ip', - 'PADDLE_NPROC_PER_NODE': 'nproc_per_node', - 'PADDLE_JOB_ID': 'job_id', - 'PADDLE_RANK': 'rank', - 'PADDLE_LOG_DIR': 'log_dir', - 'PADDLE_MAX_RESTART': 'max_restart', - 'PADDLE_ELASTIC_LEVEL': 'elastic_level', - 'PADDLE_ELASTIC_TIMEOUT': 'elastic_timeout', - 'PADDLE_SERVER_NUM': 'server_num', - 'PADDLE_TRAINER_NUM': 'trainer_num', - 'PADDLE_SERVERS_ENDPOINTS': 'servers', - 'PADDLE_TRAINERS_ENDPOINTS': 'trainers', - 'PADDLE_GLOO_PORT': 'gloo_port', - 'PADDLE_WITH_GLOO': 'with_gloo', - 'PADDLE_START_PORT': 'start_port', - 'PADDLE_IPS': 'ips', - "PADDLE_AUTO_PARALLEL_CONFIG": 'auto_parallel_config', + 'POD_IP': ('host', str), + 'PADDLE_MASTER': ('master', str), + 'PADDLE_DEVICES': ('devices', str), + 'PADDLE_NNODES': ('nnodes', str), + 'PADDLE_RUN_MODE': ('run_mode', str), + 'PADDLE_LOG_LEVEL': ('log_level', str), + 'PADDLE_LOG_OVERWRITE': ('log_overwrite', strtobool), + 'PADDLE_SORT_IP': ('sort_ip', strtobool), + 'PADDLE_NPROC_PER_NODE': ('nproc_per_node', int), + 'PADDLE_JOB_ID': ('job_id', str), + 'PADDLE_RANK': ('rank', int), + 'PADDLE_LOG_DIR': ('log_dir', str), + 'PADDLE_MAX_RESTART': ('max_restart', int), + 'PADDLE_ELASTIC_LEVEL': ('elastic_level', int), + 'PADDLE_ELASTIC_TIMEOUT': ('elastic_timeout', int), + 'PADDLE_SERVER_NUM': ('server_num', int), 
+ 'PADDLE_TRAINER_NUM': ('trainer_num', int), + 'PADDLE_SERVERS_ENDPOINTS': ('servers', str), + 'PADDLE_TRAINERS_ENDPOINTS': ('trainers', str), + 'PADDLE_GLOO_PORT': ('gloo_port', int), + 'PADDLE_WITH_GLOO': ('with_gloo', str), + 'PADDLE_START_PORT': ('start_port', int), + 'PADDLE_IPS': ('ips', str), + "PADDLE_AUTO_PARALLEL_CONFIG": ('auto_parallel_config', str), } From 38fbbe6be5fa0a5e6beaebb214636f1c67540d96 Mon Sep 17 00:00:00 2001 From: jjyaoao Date: Mon, 24 Jul 2023 18:18:12 +0800 Subject: [PATCH 15/34] =?UTF-8?q?=E4=BF=AE=E6=94=B9COPY-FROM=20No.13=20dis?= =?UTF-8?q?tributed=20(#55236)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: jjyaoao --- .../distributed/fleet/base/role_maker.py | 41 +++++++++++++++++++ .../distributed/fleet/dataset/dataset.py | 8 ++-- python/paddle/distributed/fleet/fleet.py | 29 ++++++++----- 3 files changed, 65 insertions(+), 13 deletions(-) diff --git a/python/paddle/distributed/fleet/base/role_maker.py b/python/paddle/distributed/fleet/base/role_maker.py index 113a0132f4c12..0b09fd082ba41 100755 --- a/python/paddle/distributed/fleet/base/role_maker.py +++ b/python/paddle/distributed/fleet/base/role_maker.py @@ -546,6 +546,30 @@ def _barrier(self, comm_world): class PaddleCloudRoleMaker(RoleMakerBase): + + """ + PaddleCloudRoleMaker is an interface for distributed configuration initialization based on obtaining distributed related information from environment variables. + + Examples: + .. code-block:: python + + import os + import paddle.distributed.fleet as fleet + + os.environ["PADDLE_PSERVER_NUMS"] = "2" + os.environ["PADDLE_TRAINERS_NUM"] = "2" + + os.environ["POD_IP"] = "127.0.0.1" + os.environ["PADDLE_PORT"] = "36001" + os.environ["TRAINING_ROLE"] = "PSERVER" + os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:36001,127.0.0.2:36001" + + os.environ["PADDLE_TRAINER_ID"] = "0" + + fleet.PaddleCloudRoleMaker(is_collective=False) + + """ + def __init__(self, is_collective=False, **kwargs): super().__init__() self._is_collective = is_collective @@ -1184,6 +1208,23 @@ def _generate_role(self): class UserDefinedRoleMaker(PaddleCloudRoleMaker): + + """ + UserDefinedRoleMaker is an interface for distributed configuration initialization based on obtaining distributed related information from user-defined parameters. + + Examples: + .. code-block:: python + + import paddle.distributed.fleet as fleet + from paddle.distributed.fleet.base.role_maker import Role + + fleet.UserDefinedRoleMaker( + current_id=0, + role=Role.SERVER, + worker_num=2, + server_endpoints=["127.0.0.1:36011", "127.0.0.1:36012"]) + """ + def __init__(self, is_collective=False, init_gloo=False, **kwargs): super().__init__( is_collective=is_collective, init_gloo=init_gloo, **kwargs diff --git a/python/paddle/distributed/fleet/dataset/dataset.py b/python/paddle/distributed/fleet/dataset/dataset.py index a28d439f7ecd5..8cdd57a34248e 100755 --- a/python/paddle/distributed/fleet/dataset/dataset.py +++ b/python/paddle/distributed/fleet/dataset/dataset.py @@ -1279,10 +1279,11 @@ class QueueDataset(DatasetBase): QueueDataset, it will process data streamly. Examples: - .. code-block:: python - import paddle - dataset = paddle.distributed.QueueDataset() + .. 
code-block:: python + + import paddle + dataset = paddle.distributed.QueueDataset() """ @@ -1298,6 +1299,7 @@ def init(self, **kwargs): :api_attr: Static Graph should be called only once in user's python scripts to initialize setings of dataset instance + """ super().init(**kwargs) diff --git a/python/paddle/distributed/fleet/fleet.py b/python/paddle/distributed/fleet/fleet.py index 860e0a6ce5013..2dab355264b4d 100755 --- a/python/paddle/distributed/fleet/fleet.py +++ b/python/paddle/distributed/fleet/fleet.py @@ -105,10 +105,11 @@ class Fleet: Returns: Fleet: A Fleet instance - Example for collective training: .. code-block:: python + :name: code-example1 + # Example1: for collective training import paddle paddle.enable_static() import paddle.distributed.fleet as fleet @@ -122,10 +123,11 @@ class Fleet: # do distributed training - Example for parameter server training: .. code-block:: python + :name: code-example2 + # Example2: for parameter server training import paddle paddle.enable_static() import paddle.distributed.fleet as fleet @@ -195,40 +197,39 @@ def init( Returns: None - Examples1: + Examples: .. code-block:: python + :name: code-example1 import paddle.distributed.fleet as fleet fleet.init() - Examples2: + .. code-block:: python + :name: code-example2 import paddle.distributed.fleet as fleet fleet.init(is_collective=True) - Examples3: .. code-block:: python - + :name: code-example3 import paddle.distributed.fleet as fleet role = fleet.PaddleCloudRoleMaker() fleet.init(role) - Examples4: .. code-block:: python - + :name: code-example4 import paddle.distributed.fleet as fleet strategy = fleet.DistributedStrategy() fleet.init(strategy=strategy) - Examples5: .. code-block:: python - + :name: code-example5 import paddle.distributed.fleet as fleet strategy = fleet.DistributedStrategy() fleet.init(log_level = "DEBUG") @@ -627,6 +628,14 @@ def barrier_worker(self): Returns: None + + Examples: + + .. 
code-block:: python + + import paddle.distributed.fleet as fleet + fleet.init() + fleet.barrier_worker() """ self._role_maker._barrier("worker") From 76530a2a061137a365f036f1ac8537fc1eec558e Mon Sep 17 00:00:00 2001 From: JYChen Date: Mon, 24 Jul 2023 18:31:12 +0800 Subject: [PATCH 16/34] add IndexPutGradInfermeta to fix backward error in static-mode (#55602) * add IndexPutGradInfermeta to fix backward error in static-mode * codestyle --- paddle/phi/api/yaml/backward.yaml | 3 +-- paddle/phi/infermeta/backward.cc | 15 ++++++++++++ paddle/phi/infermeta/backward.h | 7 ++++++ test/legacy_test/test_index_put_op.py | 33 +++++++++++++++++++++++++++ 4 files changed, 56 insertions(+), 2 deletions(-) diff --git a/paddle/phi/api/yaml/backward.yaml b/paddle/phi/api/yaml/backward.yaml index 465df08392d91..924a5f59d5055 100644 --- a/paddle/phi/api/yaml/backward.yaml +++ b/paddle/phi/api/yaml/backward.yaml @@ -1092,8 +1092,7 @@ args : (Tensor x, Tensor[] indices, Tensor value, Tensor out_grad, bool accumulate=false) output : Tensor(x_grad), Tensor(value_grad) infer_meta : - func : GeneralBinaryGradInferMeta - param : [x, value] + func : IndexPutGradInferMeta kernel : func : index_put_grad data_type : out_grad diff --git a/paddle/phi/infermeta/backward.cc b/paddle/phi/infermeta/backward.cc index b028fd15b1b93..d1078e2d176bc 100644 --- a/paddle/phi/infermeta/backward.cc +++ b/paddle/phi/infermeta/backward.cc @@ -1202,6 +1202,21 @@ void IndexAddGradInferMeta(const MetaTensor& index, } } +void IndexPutGradInferMeta(const MetaTensor& x, + const std::vector& indices, + const MetaTensor& value, + const MetaTensor& out_grad, + bool accumulate, + MetaTensor* x_grad, + MetaTensor* value_grad) { + if (x_grad) { + x_grad->share_meta(x); + } + if (value_grad) { + value_grad->share_meta(value); + } +} + void FusedRopeGradInferMeta(const MetaTensor& dout_q, const MetaTensor& dout_k, const MetaTensor& dout_v, diff --git a/paddle/phi/infermeta/backward.h b/paddle/phi/infermeta/backward.h index cb923e16446af..c73e5ab7a4d9e 100644 --- a/paddle/phi/infermeta/backward.h +++ b/paddle/phi/infermeta/backward.h @@ -467,4 +467,11 @@ void IndexAddGradInferMeta(const MetaTensor& index, MetaTensor* x_grad, MetaTensor* add_tensor_grad); +void IndexPutGradInferMeta(const MetaTensor& x, + const std::vector& indices, + const MetaTensor& value, + const MetaTensor& out_grad, + bool accumulate, + MetaTensor* x_grad, + MetaTensor* value_grad); } // namespace phi diff --git a/test/legacy_test/test_index_put_op.py b/test/legacy_test/test_index_put_op.py index c4bf5d6f0fd40..f21f7b084bde4 100644 --- a/test/legacy_test/test_index_put_op.py +++ b/test/legacy_test/test_index_put_op.py @@ -854,6 +854,39 @@ def test_backwardBroadCastValue2(self): atol=1e-7, ) + def test_backward_in_static(self): + paddle.enable_static() + exe = paddle.static.Executor() + train_program = paddle.static.Program() + startup_program = paddle.static.Program() + with paddle.static.program_guard(train_program, startup_program): + x = paddle.zeros((4, 2, 5)) + x.stop_gradient = False + + y = x + 1 + index = paddle.to_tensor([0, 1, 3]) + + value = paddle.ones((5,)) + value.stop_gradient = False + + z = paddle.index_put(y, (index,), value) + l = z.sum() + paddle.static.append_backward(l) + res = exe.run(fetch_list=[z, x.grad_name, value.grad_name]) + + expected_z = np.ones((4, 2, 5)) + expected_z[[0, 1, 3]] = np.ones((5,)) + + expected_x_grad = np.ones((4, 2, 5)) + expected_x_grad[[0, 1, 3]] = 0 + + expected_v_grad = np.ones((5,)) * 3 * 2 + + 
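+        # Why 3 * 2: `value` broadcasts over y[(index,)], which has shape
+        # (3, 2, 5), so each of the 5 elements of `value` is written to
+        # 3 * 2 positions of z, and z.sum() sends gradient 1 to each one.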
np.testing.assert_allclose(expected_z, res[0]) + np.testing.assert_allclose(expected_x_grad, res[1]) + np.testing.assert_allclose(expected_v_grad, res[2]) + paddle.disable_static() + class TestIndexPutAPIMixedIndices(TestIndexPutAPIBase): def init_dtype_type(self): From a3cf25e302acd8b81a4bdb83e4606efef2208470 Mon Sep 17 00:00:00 2001 From: chen <103103266+ckl117@users.noreply.github.com> Date: Mon, 24 Jul 2023 19:39:56 +0800 Subject: [PATCH 17/34] [Paddle-TRT] Convert 0D tensor to 1D tensor, increase the shape tensor's number count when collecting shape (#55503) * make 0-D tensor to 1-D tensor to support Grounding-SAM and add shape check * recover identity_op_clean_pass.cc --- .../fluid/inference/api/analysis_predictor.cc | 4 +-- .../tensorrt/convert/elementwise_op.cc | 2 +- .../inference/tensorrt/convert/op_converter.h | 29 +++++++++++++++++-- paddle/fluid/inference/tensorrt/engine.cc | 5 ++++ paddle/fluid/inference/tensorrt/engine.h | 4 +++ .../operators/tensorrt/tensorrt_engine_op.h | 12 ++++++++ 6 files changed, 50 insertions(+), 6 deletions(-) diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 44bd09e86ada9..e1c8410b9db35 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -2236,10 +2236,10 @@ void AnalysisPredictor::HookCollectShapeRangeInfo() { // We need collect value range for shape tensor for Paddle-TRT's use. // To be noticed, this method to identify all shape tensors is based on - // assumption that all shape tensors in the model have numbers <= 7. + // assumption that all shape tensors in the model have numbers <= 8. // This is a simple method to identify all shape tensors with some // mistakes, but it doesn't matter. 
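  // Concretely, a tensor is treated as a shape-tensor candidate when it
  // holds int32/int64 data and has 1 <= numel() <= 8 (the bound raised here).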
- auto is_shape_tensor = tensor.numel() <= 7 && tensor.numel() >= 1; + auto is_shape_tensor = tensor.numel() <= 8 && tensor.numel() >= 1; if ((tensor.dtype() == phi::DataType::INT32 || tensor.dtype() == phi::DataType::INT64) && is_shape_tensor) { diff --git a/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc b/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc index 252b57551e526..81e175fac5ea2 100644 --- a/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc @@ -30,7 +30,7 @@ class ElementwiseTensorOpConverter : public OpConverter { auto* X = engine_->GetITensor(op_desc.Input("X").front()); nvinfer1::ITensor* Y = nullptr; auto* Y_v = scope.FindVar(op_desc.Input("Y").front()); - if (Y_v) { + if (Y_v && !engine_->with_dynamic_shape()) { // Y is weight auto* Y_t = Y_v->GetMutable(); std::vector dims_y = phi::vectorize(Y_t->dims()); diff --git a/paddle/fluid/inference/tensorrt/convert/op_converter.h b/paddle/fluid/inference/tensorrt/convert/op_converter.h index e2991a2c708d8..b07b20bb82121 100644 --- a/paddle/fluid/inference/tensorrt/convert/op_converter.h +++ b/paddle/fluid/inference/tensorrt/convert/op_converter.h @@ -308,9 +308,15 @@ class OpConverter { auto var_shape = var->GetShape(); if (engine->with_dynamic_shape()) { #if IS_TRT_VERSION_GE(6000) - auto min_input_shape = engine->min_input_shape()[input]; - auto max_input_shape = engine->max_input_shape()[input]; - auto optim_input_shape = engine->optim_input_shape()[input]; + if (!(engine->min_input_shape().count(input) && + engine->max_input_shape().count(input) && + engine->optim_input_shape().count(input))) { + PADDLE_THROW(platform::errors::InvalidArgument( + "Cannot get %s min/max/opt shape", input)); + } + auto min_input_shape = engine->min_input_shape().at(input); + auto max_input_shape = engine->max_input_shape().at(input); + auto optim_input_shape = engine->optim_input_shape().at(input); size_t ranks = min_input_shape.size(); std::vector input_shape; @@ -732,6 +738,23 @@ class OpConverter { layer_name += output_tensor_names[i]; if (i != num_out - 1) layer_name += ", "; } + for (size_t i = 0; i < num_out; i++) { + nvinfer1::Dims tmp_dims = layer->getOutput(i)->getDimensions(); + std::vector tmp_vec; + for (int i = 0; i < tmp_dims.nbDims; i++) + tmp_vec.push_back(tmp_dims.d[i]); + + VLOG(3) << output_tensor_names[i] << "'s dimension :[" + << string::join_strings(tmp_vec, ',') << "]"; + // The following check may cause errors in CI, but is necessary in the + // latest version. + // PADDLE_ENFORCE_GE( + // layer->getOutput(i)->getDimensions().nbDims, + // 0, + // platform::errors::InvalidArgument( + // "Error occures in Paddle-TRT layer with output name: %s", + // output_tensor_names[i].c_str())); + } layer->setName((layer_name + ")").c_str()); } void SetEngine(TensorRTEngine* engine) { engine_ = engine; } diff --git a/paddle/fluid/inference/tensorrt/engine.cc b/paddle/fluid/inference/tensorrt/engine.cc index 62b7957332508..6e9beb810f0bd 100644 --- a/paddle/fluid/inference/tensorrt/engine.cc +++ b/paddle/fluid/inference/tensorrt/engine.cc @@ -532,6 +532,11 @@ nvinfer1::ITensor *TensorRTEngine::ConvertWeight2ITensor( for (int64_t i = 0; i < trt_in_shape.nbDims; i++) { trt_in_shape.d[i] = var_dims[i]; } + // Make 0-D tensor to 1-D tensor. + if (trt_in_shape.nbDims == 0) { + trt_in_shape.nbDims = 1; + trt_in_shape.d[0] = 1; + } // In fact , this is not always right, because we can't determine if the 0th // dimension is batch. 
Just for run chenqu's model if (!this->with_dynamic_shape()) { diff --git a/paddle/fluid/inference/tensorrt/engine.h b/paddle/fluid/inference/tensorrt/engine.h index 1f69bbfba097b..303778b588968 100644 --- a/paddle/fluid/inference/tensorrt/engine.h +++ b/paddle/fluid/inference/tensorrt/engine.h @@ -524,6 +524,10 @@ class TensorRTEngine { for (const auto& it : runtime_input_shape) { auto name = it.first; auto input_shape = it.second; + // Make 0-D tensor to 1-D tensor. + if (input_shape.size() == 0) { + input_shape.push_back(1); + } bool min_change = false; bool max_change = false; std::vector bak_min_shape; diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h index d6679ea26f8f7..f05ffe620c73c 100644 --- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h +++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h @@ -594,6 +594,18 @@ class TensorRTEngineOp : public framework::OperatorBase { t.ShareDataWith(out); } auto t_shape = phi::vectorize(t.dims()); + + // This must be a zero dimension tensor. + // At present, we convert it to a 1D tensor to feed them into Trt. + if (t_shape.size() == 0) { + PADDLE_ENFORCE_EQ( + t.numel(), + 1UL, + platform::errors::PreconditionNotMet( + "This tensor must have one element, but got %ld.", t.numel())); + t_shape.push_back(1); + } + // Get index of profile 0 first, then plus binding offset const int bind_index = engine->engine()->getBindingIndex(x.c_str()) + binding_offset; From ae2d8ba157540b39a4d7ab897c030217a33e82cb Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Mon, 24 Jul 2023 21:01:05 +0800 Subject: [PATCH 18/34] [AutoParallel] Simplify DistTensor namespace path (#55593) * simplify dist tensor namespace path * fix tensor dist attr decl error --- paddle/fluid/pybind/eager.cc | 2 +- paddle/fluid/pybind/eager_method.cc | 4 ++-- paddle/fluid/pybind/eager_properties.cc | 5 ++--- paddle/fluid/pybind/eager_utils.cc | 2 +- paddle/fluid/pybind/eager_utils.h | 2 +- paddle/fluid/pybind/tensor.cc | 2 +- paddle/phi/api/lib/tensor.cc | 2 +- paddle/phi/core/dense_tensor.h | 4 +--- paddle/phi/core/distributed/auto_parallel/dist_tensor.cc | 2 -- paddle/phi/core/distributed/auto_parallel/dist_tensor.h | 6 +++--- paddle/phi/core/utils/type_info.cc | 3 +-- 11 files changed, 14 insertions(+), 20 deletions(-) diff --git a/paddle/fluid/pybind/eager.cc b/paddle/fluid/pybind/eager.cc index 316a19728c19b..42d8911bdafa0 100644 --- a/paddle/fluid/pybind/eager.cc +++ b/paddle/fluid/pybind/eager.cc @@ -45,7 +45,7 @@ limitations under the License. 
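The new enforce in the converter above fires when a TensorRT input arrives without recorded min/max/opt shapes. In dynamic-shape mode those profiles come from the user-side inference config; a hedged sketch of supplying them (the model paths and the input name "x" are made up for illustration):

    from paddle.inference import Config

    config = Config("model.pdmodel", "model.pdiparams")  # hypothetical paths
    config.enable_use_gpu(100, 0)
    config.enable_tensorrt_engine(max_batch_size=4)
    # Each TensorRT input needs a min/max/opt profile; a missing entry now
    # raises "Cannot get <name> min/max/opt shape" instead of a bad access.
    config.set_trt_dynamic_shape_info(
        {"x": [1, 3, 224, 224]},  # min
        {"x": [4, 3, 224, 224]},  # max
        {"x": [1, 3, 224, 224]},  # opt
    )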
*/ #ifdef PADDLE_WITH_DISTRIBUTE #include "paddle/phi/core/distributed/auto_parallel/dist_attr.h" #include "paddle/phi/core/distributed/auto_parallel/dist_tensor.h" -using phi::distributed::auto_parallel::DistTensor; +using phi::distributed::DistTensor; using phi::distributed::auto_parallel::TensorDistAttr; #endif diff --git a/paddle/fluid/pybind/eager_method.cc b/paddle/fluid/pybind/eager_method.cc index 6a3f7e09c202a..6d07363805f8f 100644 --- a/paddle/fluid/pybind/eager_method.cc +++ b/paddle/fluid/pybind/eager_method.cc @@ -801,8 +801,8 @@ static PyObject* tensor_method_get_underline_tensor(TensorObject* self, return ToPyObject(tensor); } else if (self->tensor.is_dist_tensor()) { #ifdef PADDLE_WITH_DISTRIBUTE - auto* tensor = static_cast( - self->tensor.impl().get()); + auto* tensor = + static_cast(self->tensor.impl().get()); VLOG(6) << "dist tensor: " << tensor->defined(); return ToPyObject(tensor); #else diff --git a/paddle/fluid/pybind/eager_properties.cc b/paddle/fluid/pybind/eager_properties.cc index 42c5b97067b0e..42d53ad7bee01 100644 --- a/paddle/fluid/pybind/eager_properties.cc +++ b/paddle/fluid/pybind/eager_properties.cc @@ -164,9 +164,8 @@ PyObject* tensor_properties_get_dist_attr(TensorObject* self, void* closure) { EAGER_TRY if (self->tensor.is_dist_tensor()) { #ifdef PADDLE_WITH_DISTRIBUTE - phi::distributed::auto_parallel::DistTensor* dist_tensor = - static_cast( - self->tensor.impl().get()); + phi::distributed::DistTensor* dist_tensor = + static_cast(self->tensor.impl().get()); return ToPyObject(dist_tensor->dist_attr().get()); #else RETURN_PY_NONE diff --git a/paddle/fluid/pybind/eager_utils.cc b/paddle/fluid/pybind/eager_utils.cc index e365819928e66..8dfc7cfc8e426 100644 --- a/paddle/fluid/pybind/eager_utils.cc +++ b/paddle/fluid/pybind/eager_utils.cc @@ -859,7 +859,7 @@ PyObject* ToPyObject(const phi::DenseTensor* value) { } #ifdef PADDLE_WITH_DISTRIBUTE -PyObject* ToPyObject(const phi::distributed::auto_parallel::DistTensor* value) { +PyObject* ToPyObject(const phi::distributed::DistTensor* value) { auto obj = ::pybind11::cast(value, py::return_value_policy::reference); obj.inc_ref(); return obj.ptr(); diff --git a/paddle/fluid/pybind/eager_utils.h b/paddle/fluid/pybind/eager_utils.h index 208d2f25e7d21..1fb53a3b9f7a6 100644 --- a/paddle/fluid/pybind/eager_utils.h +++ b/paddle/fluid/pybind/eager_utils.h @@ -113,7 +113,7 @@ PyObject* ToPyObject(const std::vector>& value, PyObject* ToPyObject(const platform::Place& value); PyObject* ToPyObject(const phi::DenseTensor* value); #ifdef PADDLE_WITH_DISTRIBUTE -PyObject* ToPyObject(const phi::distributed::auto_parallel::DistTensor* value); +PyObject* ToPyObject(const phi::distributed::DistTensor* value); PyObject* ToPyObject( const phi::distributed::auto_parallel::TensorDistAttr* value); #endif diff --git a/paddle/fluid/pybind/tensor.cc b/paddle/fluid/pybind/tensor.cc index 98ae45dd0134b..e9ad190ea3f3c 100644 --- a/paddle/fluid/pybind/tensor.cc +++ b/paddle/fluid/pybind/tensor.cc @@ -1025,7 +1025,7 @@ void BindTensor(pybind11::module &m) { // NOLINT #endif #ifdef PADDLE_WITH_DISTRIBUTE - using phi::distributed::auto_parallel::DistTensor; + using phi::distributed::DistTensor; py::class_(m, "DistTensor") .def( "get_tensor", diff --git a/paddle/phi/api/lib/tensor.cc b/paddle/phi/api/lib/tensor.cc index 40319fa9ba660..b835230e22978 100644 --- a/paddle/phi/api/lib/tensor.cc +++ b/paddle/phi/api/lib/tensor.cc @@ -133,7 +133,7 @@ bool Tensor::is_dense_tensor() const { } bool Tensor::is_dist_tensor() const { #ifdef 
PADDLE_WITH_DISTRIBUTE - return phi::distributed::auto_parallel::DistTensor::classof(impl_.get()); + return phi::distributed::DistTensor::classof(impl_.get()); #else return false; #endif diff --git a/paddle/phi/core/dense_tensor.h b/paddle/phi/core/dense_tensor.h index 2cfdd7493c438..8af8f745baff7 100644 --- a/paddle/phi/core/dense_tensor.h +++ b/paddle/phi/core/dense_tensor.h @@ -30,9 +30,7 @@ namespace phi { class DenseTensorUtils; namespace distributed { -namespace auto_parallel { class DistTensor; -} // namespace auto_parallel } // namespace distributed /// \brief The Dense tensor stores values in a contiguous sequential block @@ -186,7 +184,7 @@ class DenseTensor : public TensorBase, private: friend class DenseTensorUtils; - friend class phi::distributed::auto_parallel::DistTensor; + friend class phi::distributed::DistTensor; protected: DenseTensorMeta meta_; diff --git a/paddle/phi/core/distributed/auto_parallel/dist_tensor.cc b/paddle/phi/core/distributed/auto_parallel/dist_tensor.cc index 6f60773132656..b234fc3c17485 100644 --- a/paddle/phi/core/distributed/auto_parallel/dist_tensor.cc +++ b/paddle/phi/core/distributed/auto_parallel/dist_tensor.cc @@ -16,7 +16,6 @@ namespace phi { namespace distributed { -namespace auto_parallel { void* DistTensor::AllocateFrom(Allocator* allocator, DataType dtype, @@ -59,6 +58,5 @@ void DistTensor::set_meta(const DenseTensorMeta& meta) { meta_ = meta; } -} // namespace auto_parallel } // namespace distributed } // namespace phi diff --git a/paddle/phi/core/distributed/auto_parallel/dist_tensor.h b/paddle/phi/core/distributed/auto_parallel/dist_tensor.h index ed47727fe9a3a..eb3a6dbbe3e66 100644 --- a/paddle/phi/core/distributed/auto_parallel/dist_tensor.h +++ b/paddle/phi/core/distributed/auto_parallel/dist_tensor.h @@ -18,11 +18,12 @@ #include "paddle/phi/core/dense_tensor.h" namespace phi { - namespace distributed { -namespace auto_parallel { +namespace auto_parallel { class TensorDistAttr; +} +using auto_parallel::TensorDistAttr; class DistTensor final : public phi::TensorBase, @@ -125,6 +126,5 @@ class DistTensor final std::unique_ptr value_{nullptr}; }; -} // namespace auto_parallel } // namespace distributed } // namespace phi diff --git a/paddle/phi/core/utils/type_info.cc b/paddle/phi/core/utils/type_info.cc index 2a554525024c8..0de0289887507 100644 --- a/paddle/phi/core/utils/type_info.cc +++ b/paddle/phi/core/utils/type_info.cc @@ -56,8 +56,7 @@ template class TypeInfoTraits; template class TypeInfoTraits; #ifdef PADDLE_WITH_DISTRIBUTE -template class TypeInfoTraits; +template class TypeInfoTraits; #endif #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ From a7567cd0e0b9541075bbdccb5dc0a13bca024dbc Mon Sep 17 00:00:00 2001 From: zhangbo9674 <82555433+zhangbo9674@users.noreply.github.com> Date: Tue, 25 Jul 2023 09:59:42 +0800 Subject: [PATCH 19/34] add vjp interface (#55660) --- paddle/fluid/ir/interface/interface.cc | 2 + paddle/fluid/ir/interface/vjp.h | 59 ++++++++++++++++++++++++++ 2 files changed, 61 insertions(+) create mode 100644 paddle/fluid/ir/interface/vjp.h diff --git a/paddle/fluid/ir/interface/interface.cc b/paddle/fluid/ir/interface/interface.cc index 442be02e2f235..ce43e44782867 100644 --- a/paddle/fluid/ir/interface/interface.cc +++ b/paddle/fluid/ir/interface/interface.cc @@ -14,6 +14,8 @@ #include "paddle/fluid/ir/interface/infermeta.h" #include "paddle/fluid/ir/interface/op_yaml_info.h" +#include "paddle/fluid/ir/interface/vjp.h" IR_DEFINE_EXPLICIT_TYPE_ID(paddle::dialect::InferMetaInterface) 
IR_DEFINE_EXPLICIT_TYPE_ID(paddle::dialect::OpYamlInfoInterface) +IR_DEFINE_EXPLICIT_TYPE_ID(paddle::dialect::VjpInterface) diff --git a/paddle/fluid/ir/interface/vjp.h b/paddle/fluid/ir/interface/vjp.h new file mode 100644 index 0000000000000..dec58f54af7e2 --- /dev/null +++ b/paddle/fluid/ir/interface/vjp.h @@ -0,0 +1,59 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once + +#include "paddle/ir/core/op_base.h" + +namespace paddle { +namespace dialect { +class VjpInterface : public ir::OpInterfaceBase { + public: + struct Concept { + explicit Concept(std::vector> (*vjp)( + std::vector> out_grads, + const std::vector>& stop_gradients)) + : vjp_(vjp) {} + std::vector> (*vjp_)( + std::vector> out_grads, + const std::vector>& stop_gradients); + }; + + template + struct Model : public Concept { + static std::vector> Vjp( + std::vector> out_grads, + const std::vector>& stop_gradients) { + return ConcreteOp::Vjp(out_grads, stop_gradients); + } + + Model() : Concept(Vjp) {} + }; + + VjpInterface(ir::Operation* op, Concept* impl) + : ir::OpInterfaceBase(op), impl_(impl) {} + + std::vector> Vjp( + std::vector> out_grads, + const std::vector>& stop_gradients) { + return impl_->vjp_(out_grads, stop_gradients); + } + + private: + Concept* impl_; +}; + +} // namespace dialect +} // namespace paddle + +IR_DECLARE_EXPLICIT_TYPE_ID(paddle::dialect::VjpInterface) From 14094aad5c663d0bd387ef36c33173bc6faa8da6 Mon Sep 17 00:00:00 2001 From: jiangfan06 <117341294+MuShangCC@users.noreply.github.com> Date: Tue, 25 Jul 2023 10:22:34 +0800 Subject: [PATCH 20/34] [XPU] Add FP16 support for arg_min_max (#55642) --- paddle/phi/backends/xpu/xpu2_op_list.cc | 5 ++++- paddle/phi/kernels/xpu/arg_min_max_kernel.cc | 13 ++++++++++--- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/paddle/phi/backends/xpu/xpu2_op_list.cc b/paddle/phi/backends/xpu/xpu2_op_list.cc index c146108917eeb..acbcc81b12fb2 100644 --- a/paddle/phi/backends/xpu/xpu2_op_list.cc +++ b/paddle/phi/backends/xpu/xpu2_op_list.cc @@ -36,7 +36,10 @@ XPUOpMap& get_kl2_ops() { {"adam_dense_param_sparse_grad", XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, {"adagrad", XPUKernelSet({phi::DataType::FLOAT32})}, - {"arg_max", XPUKernelSet({phi::DataType::FLOAT32})}, + {"arg_max", + XPUKernelSet({phi::DataType::INT32, + phi::DataType::FLOAT32, + phi::DataType::FLOAT16})}, {"argsort_grad", XPUKernelSet({phi::DataType::INT32, phi::DataType::INT64, diff --git a/paddle/phi/kernels/xpu/arg_min_max_kernel.cc b/paddle/phi/kernels/xpu/arg_min_max_kernel.cc index 39f04df6139fa..2b637e9da09e8 100644 --- a/paddle/phi/kernels/xpu/arg_min_max_kernel.cc +++ b/paddle/phi/kernels/xpu/arg_min_max_kernel.cc @@ -35,6 +35,7 @@ void ArgMaxKernel(const Context& dev_ctx, bool flatten, int dtype, DenseTensor* out) { + using XPUType = typename XPUTypeTrait::Type; PADDLE_ENFORCE_EQ( (dtype < 0 || dtype == ARG_MAX_OUTPUT_DATATYPE_INT32 || dtype == 
ARG_MAX_OUTPUT_DATATYPE_INT64), @@ -69,7 +70,7 @@ void ArgMaxKernel(const Context& dev_ctx, return; } r = xpu::argmax(dev_ctx.x_context(), - x.data(), + reinterpret_cast(x.data()), out->data(), xdims_vec, axis_val); @@ -90,7 +91,7 @@ void ArgMaxKernel(const Context& dev_ctx, static_cast(0)); } else { r = xpu::argmax(dev_ctx.x_context(), - x.data(), + reinterpret_cast(x.data()), out_int64.data(), xdims_vec, axis_val); @@ -116,6 +117,12 @@ void ArgMaxKernel(const Context& dev_ctx, } } } // namespace phi -PD_REGISTER_KERNEL(argmax, XPU, ALL_LAYOUT, phi::ArgMaxKernel, float) { +PD_REGISTER_KERNEL(argmax, + XPU, + ALL_LAYOUT, + phi::ArgMaxKernel, + float, + int, + phi::dtype::float16) { kernel->OutputAt(0).SetDataType(phi::DataType::UNDEFINED); } From ca72aa2ac25c0427677ae1eb85a00993b2b6fd0c Mon Sep 17 00:00:00 2001 From: jiangfan06 <117341294+MuShangCC@users.noreply.github.com> Date: Tue, 25 Jul 2023 10:28:33 +0800 Subject: [PATCH 21/34] Fix reduce_ops for mixed-precision FP16 support (#55573) --- paddle/fluid/framework/ir/auto_mixed_precision_pass.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/framework/ir/auto_mixed_precision_pass.cc b/paddle/fluid/framework/ir/auto_mixed_precision_pass.cc index 6e12cf00e903b..6139ccb313788 100644 --- a/paddle/fluid/framework/ir/auto_mixed_precision_pass.cc +++ b/paddle/fluid/framework/ir/auto_mixed_precision_pass.cc @@ -415,7 +415,8 @@ void AutoMixedPrecisionPass::GetOpPrecision() const { auto out_dtype = op_node->Op()->GetAttrIfExists("out_dtype"); support_low_precision = support_low_precision && - IsFP32AndFP64(static_cast(out_dtype)); + (IsFP32AndFP64(static_cast(out_dtype)) || + out_dtype == -1); } // If scale op's "scale" and "bias" attr value exceed the range of fp16 From 05a40691e2d1de7244140ab486cf2fe32bfc5729 Mon Sep 17 00:00:00 2001 From: HongyuJia Date: Tue, 25 Jul 2023 10:31:13 +0800 Subject: [PATCH 22/34] [0D-Tensor] Fix test_elementwise_max_op unittest (#55674) --- test/legacy_test/test_elementwise_max_op.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/test/legacy_test/test_elementwise_max_op.py b/test/legacy_test/test_elementwise_max_op.py index 40aeaa50a0e1c..2d202f1c0843a 100644 --- a/test/legacy_test/test_elementwise_max_op.py +++ b/test/legacy_test/test_elementwise_max_op.py @@ -124,9 +124,6 @@ def init_data(self): self.x = np.random.uniform(0.1, 1, []).astype("float64") self.y = np.random.uniform(0.1, 1, []).astype("float64") - def if_enbale_cinn(self): - self.enable_cinn = False - class TestElementwiseMaxFP16Op_ZeroDim1(TestElementwiseFP16Op): def init_data(self): @@ -142,9 +139,6 @@ def init_data(self): self.x = np.random.uniform(0.1, 1, [13, 17]).astype("float64") self.y = np.random.uniform(0.1, 1, []).astype("float64") - def if_enbale_cinn(self): - self.enable_cinn = False - class TestElementwiseMaxFP16Op_ZeroDim2(TestElementwiseFP16Op): def init_data(self): @@ -160,9 +154,6 @@ def init_data(self): self.x = np.random.uniform(0.1, 1, []).astype("float64") self.y = np.random.uniform(0.1, 1, [13, 17]).astype("float64") - def if_enbale_cinn(self): - self.enable_cinn = False - class TestElementwiseMaxFP16Op_ZeroDim3(TestElementwiseFP16Op): def init_data(self): From 03a2f1878cc37efabe55e3dbdf9c08f80019c0e1 Mon Sep 17 00:00:00 2001 From: qiuwenbo Date: Tue, 25 Jul 2023 10:34:28 +0800 Subject: [PATCH 23/34] =?UTF-8?q?=E8=A7=A3=E5=86=B3=20grad=5Ffn=20next=5Ff?= =?UTF-8?q?unctions=20api=20=E6=8E=A5=E5=8F=A3=E5=AF=BC=E8=87=B4=E5=86=85?= 
=?UTF-8?q?=E5=AD=98=E5=BC=82=E5=B8=B8=E7=9A=84=E9=97=AE=E9=A2=98=20-=20?=
 =?UTF-8?q?=20(#55627)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* [Attempt] Add an attribute to tensor whose value is the constant 1

* Expose gradnode and build new gradnode methods (for testing), exposed to Python so the Python side can access them

* Develop the two APIs grad_fn and next_functions, expose them to the Python side, and do some normalization

* Add a unit test

* Improve code style

* Move the unit test file to the correct location

* Improve code style

* Remove useless comments

* Fix "__main__ has no attribute"

* Modify the unit test file

* Modify the unit test script (temp)

* Fix the memory anomaly caused by the grad_fn / next_functions API

* Modify the unit test contents

* Fix code-style issues

---
 paddle/fluid/pybind/eager_properties.cc        |  9 ++++---
 paddle/fluid/pybind/eager_utils.cc             |  5 ++--
 paddle/fluid/pybind/eager_utils.h              |  2 +-
 paddle/fluid/pybind/pybind.cc                  | 24 ++++++++++++++-----
 .../test_grad_fn_and_next_functions.py         |  5 ++++
 5 files changed, 30 insertions(+), 15 deletions(-)

diff --git a/paddle/fluid/pybind/eager_properties.cc b/paddle/fluid/pybind/eager_properties.cc
index 42d53ad7bee01..2a7692ee99bb3 100644
--- a/paddle/fluid/pybind/eager_properties.cc
+++ b/paddle/fluid/pybind/eager_properties.cc
@@ -317,17 +317,16 @@ PyObject* tensor_properties_get_grad_fn(TensorObject* self, void* closure) {
 
   if (meta) {
     // Get the GradNode from meta
-    auto grad_node = meta->GradNode();  // Convert GradNode to a Python object
-    // The conversion will depend on the structure of GradNode.
-
-    if (!grad_node) {
+    auto grad_node_ptr = meta->GetMutableGradNode();
+    if (!grad_node_ptr) {
       Py_INCREF(Py_None);
       return Py_None;
     }
 
-    PyObject* py_grad_node = ToPyObject(grad_node);
+    PyObject* py_grad_node = ToPyObject(grad_node_ptr);
 
     return py_grad_node;
+
   } else {
     // If meta does not exist, return an appropriate Python object (e.g., None
     // or a special value).
diff --git a/paddle/fluid/pybind/eager_utils.cc b/paddle/fluid/pybind/eager_utils.cc
index 8dfc7cfc8e426..ee270042f4176 100644
--- a/paddle/fluid/pybind/eager_utils.cc
+++ b/paddle/fluid/pybind/eager_utils.cc
@@ -1006,10 +1006,9 @@ paddle::optional<paddle::Tensor> GetOptionalTensorFromArgs(
   }
 }
 
-PyObject* ToPyObject(egr::GradNodeBase* grad_node) {
+PyObject* ToPyObject(std::shared_ptr<egr::GradNodeBase> grad_node) {
   py::object py_obj = py::cast(grad_node, py::return_value_policy::reference);
-  py::handle py_handle = py::handle(py_obj);
-  PyObject* py_grad_node = py_handle.ptr();
+  PyObject* py_grad_node = py_obj.release().ptr();
   Py_INCREF(py_grad_node);
   return py_grad_node;
 }
diff --git a/paddle/fluid/pybind/eager_utils.h b/paddle/fluid/pybind/eager_utils.h
index 1fb53a3b9f7a6..f50ec9395b2f1 100644
--- a/paddle/fluid/pybind/eager_utils.h
+++ b/paddle/fluid/pybind/eager_utils.h
@@ -126,7 +126,7 @@ PyObject* ToPyObject(
     const std::unordered_map>& value);
 PyObject* ToPyObject(const paddle::framework::Vocab& value);
 
-PyObject* ToPyObject(egr::GradNodeBase* grad_node);
+PyObject* ToPyObject(std::shared_ptr<egr::GradNodeBase> grad_node);
 
 class PyTensorHook : public egr::TensorHook {
  public:
diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index d55cab98b1eba..504e1adf22569 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -778,12 +778,24 @@ PYBIND11_MODULE(libpaddle, m) {
         }
       });
 
-  py::class_<egr::GradNodeBase>(m, "GradNodeBase")
-      .def("name", &egr::GradNodeBase::name)
-      .def_property_readonly("next_functions",
-                             &egr::GradNodeBase::NextFunctions)
-      .def("input_meta", &egr::GradNodeBase::InputMeta)
-      .def("output_meta", &egr::GradNodeBase::OutputMeta);
+  py::class_<egr::GradNodeBase, std::shared_ptr<egr::GradNodeBase>>(
+      m, "GradNodeBase")
+      .def("name",
+           [](const std::shared_ptr<egr::GradNodeBase> &self) {
+             return self->name();
+           })
+      .def_property_readonly(
+          "next_functions",
+          [](const std::shared_ptr<egr::GradNodeBase> &self) {
+            return self->NextFunctions();
+          })
+      .def("input_meta",
+           [](const std::shared_ptr<egr::GradNodeBase> &self) {
+             return self->InputMeta();
+           })
+      .def("output_meta", [](const std::shared_ptr<egr::GradNodeBase> &self) {
+        return self->OutputMeta();
+      });
 
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   m.def("cudnn_version", &platform::DnnVersion);
diff --git a/test/legacy_test/test_grad_fn_and_next_functions.py b/test/legacy_test/test_grad_fn_and_next_functions.py
index 5464775001253..531cdfa98a070 100644
--- a/test/legacy_test/test_grad_fn_and_next_functions.py
+++ b/test/legacy_test/test_grad_fn_and_next_functions.py
@@ -83,6 +83,11 @@ def check_func(self, grad_fn, grad_fn_json) -> None:
             grad_fn_json (dict): grad_node_json of node
         """
         self.assertEqual(grad_fn.name(), grad_fn_json["func_name"])
+        # Recursively test other nodes
+        if hasattr(grad_fn, 'next_functions') and grad_fn.next_functions[0]:
+            next_funcs_json = grad_fn_json["next_funcs"]
+            for u in grad_fn.next_functions:
+                self.check_func(u, next_funcs_json[u.name()])
 
 
 if __name__ == "__main__":

From 98c7a3e0bb50025b98b82a95f6580882b9334fa8 Mon Sep 17 00:00:00 2001
From: kangguangli
Date: Tue, 25 Jul 2023 10:47:28 +0800
Subject: [PATCH 24/34] [BugFix] fix random fail of test_bilinear_interp_v2_op
 (#55643)

* fix random fail of test_bilinear_interp_v2_op

* reset if compiledProgram

---
 test/legacy_test/eager_op_test.py | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/test/legacy_test/eager_op_test.py b/test/legacy_test/eager_op_test.py
index 838e8de35235c..489d8ce02d7dd 100644
--- a/test/legacy_test/eager_op_test.py
+++ b/test/legacy_test/eager_op_test.py
@@ -1211,13 +1211,21 @@ def _check_ir_output(self, place, program, feed_map, fetch_list, outs):
             return
         set_flags({"FLAGS_enable_new_ir_in_executor": True})
-
+        new_scope = paddle.static.Scope()
         executor = Executor(place)
+        new_program = None
+        if isinstance(program, paddle.static.CompiledProgram):
+            new_program = fluid.CompiledProgram(
+                program._program, build_strategy=program._build_strategy
+            )
+        else:
+            new_program = program.clone()
         ir_outs = executor.run(
-            program,
+            new_program,
             feed=feed_map,
             fetch_list=fetch_list,
             return_numpy=False,
+            scope=new_scope,
         )
         assert len(outs) == len(
             ir_outs

From 057202574bf2c4e15d4bb5f4d313488af3870202 Mon Sep 17 00:00:00 2001
From: Yiqun Liu
Date: Tue, 25 Jul 2023 10:57:27 +0800
Subject: [PATCH 25/34] Call multiply_ instead of scale_ to avoid multiple
 DtoH copies. (#55589)

* Call multiply_ instead of scale_ to avoid multiple DtoH copies.

* Call _squared_l2_norm to calculate grad_clip.

* Fix import error.
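The switch below from scale_ to multiply_ is easy to miss: in this clipping path the coefficient passed to scale_ is consumed as a host-side scalar, so a GPU-resident clip_var costs one device-to-host copy per gradient, while multiply_ is a tensor-tensor elementwise op that keeps everything on the device. A minimal sketch (assumes a GPU build; the shapes are illustrative):

    import paddle

    g = paddle.rand([1024, 1024])     # stands in for one gradient tensor
    clip_var = paddle.full([1], 0.5)  # global-norm coefficient, on device

    # g.scale_(float(clip_var)) would synchronize: reading the scalar back
    # to the host blocks once per gradient in the clipping loop.
    g.multiply_(clip_var)             # broadcast in-place multiply, no DtoH copy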
---
 .../dygraph_optimizer/hybrid_parallel_optimizer.py | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_optimizer.py
index b24247b580766..cef28af4d4795 100755
--- a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_optimizer.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_optimizer.py
@@ -103,8 +103,7 @@ def _dygraph_clip(self, params_grads):
             if g.type == core.VarDesc.VarType.SELECTED_ROWS:
                 merge_grad = clip.merge_selected_rows(g)
                 merge_grad = clip.get_tensor_from_selected_rows(merge_grad)
-            square = paddle.square(merge_grad)
-            sum_square = paddle.sum(square)
+            sum_square = clip._squared_l2_norm(merge_grad)
 
             not_shared_enable = (not hasattr(p, 'is_firstly_shared')) or (
                 hasattr(p, 'is_firstly_shared')
@@ -230,15 +229,15 @@ def _dygraph_clip(self, params_grads):
                 if getattr(p, 'need_clip', True) is False:
                     continue
                 if g.dtype == paddle.float16:
-                    g.scale_(clip_var_fp16)
+                    g.multiply_(clip_var_fp16)
                 elif g.dtype == paddle.bfloat16:
                     if paddle.is_compiled_with_xpu():
                         raise NotImplementedError(
                             "BF16 is not supported on XPU now"
                         )
-                    g.scale_(clip_var_bf16)
+                    g.multiply_(clip_var_bf16)
                 else:
-                    g.scale_(clip_var)
+                    g.multiply_(clip_var)
                 p._reset_grad_inplace_version(True)
 
         return params_grads

From fb9bec5d78fbd8d6500055f77b302dd63afc4ccf Mon Sep 17 00:00:00 2001
From: hong <43953930+phlrain@users.noreply.github.com>
Date: Tue, 25 Jul 2023 13:35:14 +0800
Subject: [PATCH 26/34] [NewIR] new ir dygraph to static support gpu (#55620)

* add kernel dialect

* change DenseTensorTypeStorage to DenseTensorType

* add test case

* add first pd_op to kernel dialect

* lower pd op to kernel dialect

* update

* update

* remove useless code

* add attribute print test

* fix bug

* update

* update

* update

* update

* polish code

* fix bug

* polish code and add python test

* add test

* fix test error

* relax constraint when inserting get_parameter

* add env flag

* fix bug

* dygraph2static support new ir

* fix bug

* revert test env

* change cc_test_old to cc_test

* update

* fix build_static bug

* update test

* fix type test error

* update cmake

* disable test in windows

* fix inference compile

* fix program translator error

* only run on cpu, not support gpu yet

* fix conflict

* polish code

* fix bug

* add feed with place op

* update

* remove useless unit test

* update mkldnn

* update

* update

* align mkldnn version

* new ir support builtin slice op

* fix bug

* fix phi kernel adaptor bug

* add enable static

* add enable_static

* remove useless test case

* change feed list to single variable

* update

* add feed with place and shadow output op

* fix bug

* remove useless code

* support gpu

* fix bug

* fix bug

* remove template

* add more data type

* fix compile bug

* update

* remove useless code

* revert dygraph2st test

* remove useless code

* revert op

* fix bug

* new ir dygraph2static support gpu

* remove useless code

* code polish

* add const

* revert code and remove useless code

* revert code

* revert legacy op yaml

* remove useless code

* delete std::move

---------

Co-authored-by: kangguangli

---
 .../eager/to_static/run_program_op_node.h |  59 +++++--
 paddle/fluid/framework/CMakeLists.txt     |   3 +-
 paddle/fluid/framework/executor_cache.cc  | 167 +++++++++++++++++-
 paddle/fluid/framework/executor_cache.h   |  26 ++-
 .../interpreter/interpreter_util.cc       |   3 +-
paddle/fluid/framework/tensor_util.cc | 2 +- .../ir/phi_kernel_adaptor/phi_kernel_util.cc | 30 +++- .../ir/transforms/pd_op_to_kernel_pass.cc | 14 ++ .../ir_adaptor/translator/op_translator.cc | 62 +++++++ .../translator/program_translator.cc | 8 + paddle/phi/api/yaml/op_compat.yaml | 7 + paddle/phi/api/yaml/ops.yaml | 22 +++ paddle/phi/api/yaml/static_ops.yaml | 12 -- .../phi/kernels/cpu/feed_with_place_kernel.cc | 8 + paddle/phi/kernels/feed_with_place_kernel.h | 6 + test/ir/new_ir/test_feed_with_place.py | 1 + test/ir/new_ir/test_standalone_new_ir.py | 65 ++++++- 17 files changed, 459 insertions(+), 36 deletions(-) diff --git a/paddle/fluid/eager/to_static/run_program_op_node.h b/paddle/fluid/eager/to_static/run_program_op_node.h index 574adc1f9d4d4..a8e47953f65dd 100644 --- a/paddle/fluid/eager/to_static/run_program_op_node.h +++ b/paddle/fluid/eager/to_static/run_program_op_node.h @@ -19,12 +19,16 @@ #include "paddle/fluid/eager/tensor_wrapper.h" #include "paddle/fluid/framework/new_executor/interpretercore.h" #include "paddle/fluid/framework/variable_helper.h" +#include "paddle/fluid/ir/transforms/pd_op_to_kernel_pass.h" +#include "paddle/fluid/ir_adaptor/translator/program_translator.h" #include "paddle/fluid/operators/run_program_op.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/profiler/event_tracing.h" #include "paddle/ir/core/program.h" #include "paddle/ir/core/value.h" +PHI_DECLARE_bool(enable_new_ir_in_executor); + namespace details { using Tensor = paddle::Tensor; @@ -367,16 +371,32 @@ inline void RunProgramAPI( details::ShareTensorsIntoScope(x, global_inner_scope); details::ShareTensorsIntoScope(params, global_inner_scope); // Step 2. create new interpretercore - interpreter_core = - paddle::framework::CreateInterpreterCoreInfoToCache(*forward_program, - place, - /*is_grad=*/false, - program_id, - global_inner_scope); + + if (FLAGS_enable_new_ir_in_executor) { + // build new ir program + auto ir_program = paddle::framework::ConstructFowardIrProgram( + forward_global_block, backward_global_block, output_names, x); + interpreter_core = + paddle::framework::CreateNewIRInterpreterCoreInfoToCache( + std::move(ir_program), + place, + /*is_grad=*/false, + program_id, + global_inner_scope); + } else { + interpreter_core = + paddle::framework::CreateProgramInterpreterCoreInfoToCache( + *forward_program, + place, + /*is_grad=*/false, + program_id, + global_inner_scope); + } // Step 3. 
get all eager gc vars std::set skip_eager_delete_vars = paddle::framework::details::ParseSafeEagerDeletionSkipVarsSet( *backward_program); + // all out_vars are skip_eager_var skip_eager_delete_vars.insert(output_names.begin(), output_names.end()); skip_eager_delete_vars.insert(dout_names.begin(), dout_names.end()); @@ -504,12 +524,27 @@ inline void RunProgramGradAPI( 1); VLOG(2) << "No interpretercore cahce, so create a new interpretercore"; details::ShareTensorsIntoScope(out_grad, global_inner_scope); - interpreter_core = - paddle::framework::CreateInterpreterCoreInfoToCache(*backward_program, - place, - /*is_grad=*/true, - program_id, - global_inner_scope); + + if (FLAGS_enable_new_ir_in_executor) { + auto res = paddle::framework::ConstructBackwardIrProgram( + backward_global_block, out_grad, x_grad, params_grad); + + interpreter_core = + paddle::framework::CreateNewIRInterpreterCoreInfoToCache( + std::move(res), + place, + /*is_grad=*/true, + program_id, + global_inner_scope); + } else { + interpreter_core = + paddle::framework::CreateProgramInterpreterCoreInfoToCache( + *backward_program, + place, + /*is_grad=*/true, + program_id, + global_inner_scope); + } // share threadpool // NOTE(zhiqiu): this only works interpreter_core is executed strictly diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index 4137518cf69d4..41b681afb5400 100755 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -1033,7 +1033,8 @@ cc_library( cc_library( executor_cache SRCS executor_cache.cc - DEPS parallel_executor standalone_executor) + DEPS parallel_executor standalone_executor phi_kernel_adaptor + pd_op_to_kernel_pass ir) if(WITH_PSCORE) get_property(RPC_DEPS GLOBAL PROPERTY RPC_DEPS) if(WITH_HETERPS) diff --git a/paddle/fluid/framework/executor_cache.cc b/paddle/fluid/framework/executor_cache.cc index 9e8f4a25873d1..506ce36e47242 100644 --- a/paddle/fluid/framework/executor_cache.cc +++ b/paddle/fluid/framework/executor_cache.cc @@ -15,6 +15,8 @@ #include "paddle/fluid/framework/executor_cache.h" #include "paddle/fluid/framework/new_executor/interpretercore.h" #include "paddle/fluid/framework/op_info.h" +#include "paddle/fluid/ir/transforms/pd_op_to_kernel_pass.h" +#include "paddle/fluid/ir_adaptor/translator/translate.h" #include "paddle/ir/core/program.h" #include "paddle/ir/core/value.h" @@ -288,7 +290,7 @@ InterpreterCoreInfoCache &InterpreterCoreInfoCache::Instance() { return g_info_cache; } -std::shared_ptr CreateInterpreterCoreInfoToCache( +std::shared_ptr CreateProgramInterpreterCoreInfoToCache( const ProgramDesc &program_desc, const platform::Place &place, bool is_grad, @@ -304,13 +306,172 @@ std::shared_ptr CreateInterpreterCoreInfoToCache( interpreter::ExecutionConfig execution_config; execution_config.create_local_scope = false; execution_config.used_for_jit = true; - auto core = std::make_shared( - place, program_desc.Block(0), scope, execution_config); + + std::shared_ptr core = nullptr; + + core.reset(new InterpreterCore( + place, program_desc.Block(0), scope, execution_config)); + + auto &cached_value = + interpretercore_info_cache.GetMutable(program_id, is_grad); + cached_value.core_ = core; + return core; +} + +std::shared_ptr CreateNewIRInterpreterCoreInfoToCache( + std::unique_ptr<::ir::Program> ir_program, + const platform::Place &place, + bool is_grad, + int64_t program_id, + framework::Scope *scope) { + auto &interpretercore_info_cache = + framework::InterpreterCoreInfoCache::Instance(); + if 
(interpretercore_info_cache.Size() > 10u /* max_cached_size*/) { + VLOG(2) << "The cached info size has exceeded max_cached_size: 4, clear " + "all cache!"; + interpretercore_info_cache.Finalize(); + } + interpreter::ExecutionConfig execution_config; + execution_config.create_local_scope = false; + execution_config.used_for_jit = true; + + std::shared_ptr core = nullptr; + + core.reset(new InterpreterCore( + place, std::move(ir_program), scope, execution_config)); + auto &cached_value = interpretercore_info_cache.GetMutable(program_id, is_grad); cached_value.core_ = core; return core; } +std::unique_ptr<::ir::Program> ConstructFowardIrProgram( + const paddle::framework::BlockDesc *forward_global_block, + const paddle::framework::BlockDesc *backward_global_block, + const std::vector output_names, + const std::vector &x) { + auto ir_ctx = ::ir::IrContext::Instance(); + auto program = std::make_unique<::ir::Program>(ir_ctx); + + std::set set_output_names; + auto local_program = + paddle::framework::ProgramDesc(*(forward_global_block->Program())); + + for (auto op_desc : local_program.Block(0).AllOps()) { + for (const auto &n : op_desc->Outputs()) { + const auto &input_var_names = n.second; + for (const auto &var_name : input_var_names) { + set_output_names.insert(var_name); + } + } + } + + // add fetch with place op to program + for (auto &in_t : x) { + auto name = in_t.name(); + auto place = in_t.place().GetType(); + + auto op_desc = local_program.MutableBlock(0)->PrependOp(); + op_desc->SetType("feed_with_place"); + op_desc->SetAttr("index", 0); + // TODO(phlrain) : using tensor dtype + op_desc->SetAttr("dtype", 0); + op_desc->SetAttr("place", static_cast(place)); + op_desc->SetAttr("name", name); + op_desc->SetOutput("out", {name}); + } + + std::set set_parameter_names; + for (auto op_desc : backward_global_block->Program()->Block(0).AllOps()) { + for (const auto &n : op_desc->Inputs()) { + const auto &input_var_names = n.second; + for (const auto &var_name : input_var_names) { + set_parameter_names.insert(var_name); + } + } + } + + for (auto &t : output_names) { + set_parameter_names.insert(t); + } + + for (auto &name : set_parameter_names) { + if (!set_output_names.count(name)) { + continue; + } + + auto op_desc = local_program.MutableBlock(0)->AppendOp(); + op_desc->SetType("shaddow_output"); + op_desc->SetAttr("name", name); + op_desc->SetInput("x", {name}); + op_desc->SetOutput("out", {"@EMPTY@"}); + } + + paddle::translator::ProgramTranslator program_translator(&local_program, + program.get()); + + program_translator.Translate(); + + auto ir_res = paddle::dialect::PdOpLowerToKernelPass(program.get()); + + return ir_res; +} + +std::unique_ptr<::ir::Program> ConstructBackwardIrProgram( + const paddle::framework::BlockDesc *backward_global_block, + const std::vector &out_grad, + const std::vector &x_grad, + const std::vector ¶ms_grad) { + auto ir_ctx = ::ir::IrContext::Instance(); + auto program = std::make_unique<::ir::Program>(ir_ctx); + + auto local_program = + paddle::framework::ProgramDesc(*(backward_global_block->Program())); + // add feed kernel + for (auto &out_grad_t : out_grad) { + auto name = out_grad_t.name(); + auto place = out_grad_t.place().GetType(); + if (name == "@EMPTY@") { + continue; + } + auto op_desc = local_program.MutableBlock(0)->PrependOp(); + op_desc->SetType("feed_with_place"); + op_desc->SetAttr("index", 0); + // TODO(phlrain) : using tensor dtype + op_desc->SetAttr("dtype", 0); + op_desc->SetAttr("place", static_cast(place)); + op_desc->SetAttr("name", 
name); + op_desc->SetOutput("out", {name}); + } + + std::vector param_grad_names; + for (auto &p_g : params_grad) { + param_grad_names.push_back(p_g->name()); + } + + for (auto &t : x_grad) { + param_grad_names.push_back(t->name()); + } + for (auto &name : param_grad_names) { + if (name == "@EMPTY@") { + continue; + } + auto op_desc = local_program.MutableBlock(0)->AppendOp(); + op_desc->SetType("shaddow_output"); + op_desc->SetAttr("name", name); + op_desc->SetInput("x", {name}); + op_desc->SetOutput("out", {"@EMPTY@"}); + } + + paddle::translator::ProgramTranslator program_translator(&local_program, + program.get()); + program_translator.Translate(); + + auto res = paddle::dialect::PdOpLowerToKernelPass(program.get()); + + return res; +} + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/executor_cache.h b/paddle/fluid/framework/executor_cache.h index f4d926d74c146..c639b966286cb 100644 --- a/paddle/fluid/framework/executor_cache.h +++ b/paddle/fluid/framework/executor_cache.h @@ -29,6 +29,11 @@ #include "paddle/fluid/platform/macros.h" #include "paddle/fluid/string/string_helper.h" +#include "paddle/fluid/ir_adaptor/translator/program_translator.h" +#include "paddle/ir/core/dialect.h" +#include "paddle/ir/core/ir_context.h" +#include "paddle/ir/core/program.h" + namespace paddle { namespace framework { namespace ir { @@ -218,12 +223,31 @@ class InterpreterCoreInfoCache { std::unordered_map info_map_; }; -std::shared_ptr CreateInterpreterCoreInfoToCache( +std::shared_ptr CreateProgramInterpreterCoreInfoToCache( const ProgramDesc& program_desc, const platform::Place& place, bool is_grad, int64_t program_id, framework::Scope* scope); +std::shared_ptr CreateNewIRInterpreterCoreInfoToCache( + std::unique_ptr<::ir::Program> ir_prog, + const platform::Place& place, + bool is_grad, + int64_t program_id, + framework::Scope* scope); + +std::unique_ptr<::ir::Program> ConstructFowardIrProgram( + const paddle::framework::BlockDesc* forward_global_block, + const paddle::framework::BlockDesc* backward_global_block, + const std::vector output_names, + const std::vector& x); + +std::unique_ptr<::ir::Program> ConstructBackwardIrProgram( + const paddle::framework::BlockDesc* backward_global_block, + const std::vector& out_grad, + const std::vector& x_grad, + const std::vector& params_grad); + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc b/paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc index 70be3b9dd035a..035f4cd4f16d9 100644 --- a/paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc +++ b/paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc @@ -958,7 +958,8 @@ void BuildOpFuncList( if (op_name == "builtin.combine" || op_name == "pd.feed" || op_name == "builtin.set_parameter" || - op_name == "builtin.get_parameter" || op_name == "builtin.slice") { + op_name == "builtin.get_parameter" || op_name == "builtin.slice" || + op_name == "pd.feed_with_place" || op_name == "pd.shaddow_output") { VLOG(6) << "skip process " << op_name; continue; } diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc index 6552a14a03fcc..d4421ed7ab009 100644 --- a/paddle/fluid/framework/tensor_util.cc +++ b/paddle/fluid/framework/tensor_util.cc @@ -984,7 +984,7 @@ std::ostream& operator<<(std::ostream& os, const phi::DenseTensor& t) { do { \ if (paddle::framework::TransToProtoVarType(tensor.dtype()) == \ proto_type) { \ - os << " - 
dtype: " << proto_type << "\n"; \ + os << " - dtype: " << tensor.dtype() << "\n"; \ paddle::framework::print_tensor(os, tensor); \ return os; \ } \ diff --git a/paddle/fluid/ir/phi_kernel_adaptor/phi_kernel_util.cc b/paddle/fluid/ir/phi_kernel_adaptor/phi_kernel_util.cc index 1a880210afbe1..95702ac672113 100644 --- a/paddle/fluid/ir/phi_kernel_adaptor/phi_kernel_util.cc +++ b/paddle/fluid/ir/phi_kernel_adaptor/phi_kernel_util.cc @@ -66,8 +66,10 @@ paddle::framework::Variable* CreateVar( } paddle::framework::Variable* var = nullptr; + std::string name = var_name_prefix + "_inner_var_" + std::to_string(variable_2_var_name->size()); + if (force_persisable || is_persisable) { VLOG(6) << "Create var: " << name << " in scope " << inner_scope->root(); var = const_cast(inner_scope->root())->Var(name); @@ -202,6 +204,15 @@ void HandleForSpecialOp( value_2_var_name->emplace(value, feed_var_name); } + if (op_name == "pd.feed_with_place") { + VLOG(6) << "Handle for pd.feed_with_place"; + auto var_name = + op->attributes().at("name").dyn_cast().AsString(); + + auto value = op->result(0); + value_2_var_name->emplace(value, var_name); + } + if (op_name == "builtin.combine") { auto out_value = op->result(0); @@ -252,6 +263,22 @@ void HandleForSpecialOp( (*value_2_var_name)[value] = param_name; } + if (op_name == "pd.shaddow_output") { + VLOG(6) << "Handle for pd.shaddow_ouptut"; + auto var_name = + op->attributes().at("name").dyn_cast().AsString(); + + auto value = op->operand(0); + // change opreand name to param_name + auto orig_name = value_2_var_name->at(value); + + if (inner_scope->root()->FindVar(var_name) == nullptr) { + const_cast(inner_scope->root()) + ->Rename(orig_name, var_name); + } + (*value_2_var_name)[value] = var_name; + } + if (op_name == "builtin.get_parameter") { VLOG(6) << "Handle for builtin.get_parameter:"; auto param_name = op->attributes() @@ -362,7 +389,8 @@ void BuildScope(const ir::Block& block, if (op_name == "pd.feed" || op_name == "pd.fetch" || op_name == "builtin.combine" || op_name == "builtin.set_parameter" || - op_name == "builtin.get_parameter" || op_name == "builtin.slice") { + op_name == "builtin.get_parameter" || op_name == "builtin.slice" || + op_name == "pd.feed_with_place" || op_name == "pd.shaddow_output") { HandleForSpecialOp(op, inner_scope, var_name_prefix, diff --git a/paddle/fluid/ir/transforms/pd_op_to_kernel_pass.cc b/paddle/fluid/ir/transforms/pd_op_to_kernel_pass.cc index d55ce6b24f9cf..beb4635bebba4 100644 --- a/paddle/fluid/ir/transforms/pd_op_to_kernel_pass.cc +++ b/paddle/fluid/ir/transforms/pd_op_to_kernel_pass.cc @@ -62,6 +62,20 @@ phi::KernelKey GetKernelKey( TransToPhiDataType( op->result(0).type().dyn_cast().dtype())}; } + + if (op->name() == "pd.feed_with_place") { + // NOTE, for now feed op don't need a kernel, so the data type from Op + // Result the next op use base program datatype + auto t = + op->attributes().at("place").dyn_cast().data(); + + auto backend = paddle::experimental::ParseBackend(t); + return {backend, + phi::DataLayout::ANY, + TransToPhiDataType( + op->result(0).type().dyn_cast().dtype())}; + } + phi::Backend kernel_backend = phi::Backend::UNDEFINED; phi::DataLayout kernel_layout = phi::DataLayout::UNDEFINED; phi::DataType kernel_data_type = phi::DataType::UNDEFINED; diff --git a/paddle/fluid/ir_adaptor/translator/op_translator.cc b/paddle/fluid/ir_adaptor/translator/op_translator.cc index 0aab57af7998a..ee2f66692eda8 100644 --- a/paddle/fluid/ir_adaptor/translator/op_translator.cc +++ 
b/paddle/fluid/ir_adaptor/translator/op_translator.cc @@ -954,6 +954,39 @@ struct FeedOpTranscriber : public OpTranscriber { } }; +struct FeedWithPlaceOpTranscriber : public OpTranscriber { + ir::AttributeMap TranslateOpAttribute( + ir::IrContext* ctx, + const std::string& normalized_op_name, + const OpAttributeInfoList& op_attr_infos, + const OpDesc& op_desc) override { + int allocate_type = paddle::get(op_desc.GetAttr("place")); + ir::AttributeMap attribute_map = { + {"name", + ir::StrAttribute::get(ctx, + op_desc.GetAttrIfExists("name"))}, + {"index", ir::Int64Attribute::get(ctx, 0)}, + {"dtype", + paddle::dialect::DataTypeAttribute::get(ctx, phi::DataType::FLOAT32)}, + {"place", + paddle::dialect::PlaceAttribute::get( + ctx, phi::Place(static_cast(allocate_type)))}, + }; + + return attribute_map; + } + + std::vector GenerateOperationInput( + ir::IrContext* ctx, + TranslationContext* param_map, + const OpDesc& op_desc, + const std::string& normalized_op_name, + const OpInputInfoList& input_infos, + ir::Program* program) override { + return {}; + } +}; + struct SplitOpTranscriber : public OpTranscriber { std::vector GenerateOperationInput( ir::IrContext* ctx, @@ -1087,6 +1120,32 @@ struct FetchOpTranscriber : public OpTranscriber { } }; +struct ShaddowOutputOpTranscriber : public OpTranscriber { + ir::Operation* operator()(ir::IrContext* ctx, + TranslationContext* param_map, + const OpDesc& op_desc, + ir::Program* program) override { + std::vector op_inputs; + auto legacy_input_vars = op_desc.Input("x", true); + + auto defining_info = (*param_map)[legacy_input_vars[0]]; + op_inputs.push_back(defining_info.value); + + ir::AttributeMap attribute_map = { + {"parameter_name", + ir::StrAttribute::get(ctx, + op_desc.GetAttrIfExists("name"))}, + }; + + auto create_op_info = ctx->GetRegisteredOpInfo(ir::SetParameterOp::name()); + ir::Operation* operation = + ir::Operation::Create(op_inputs, attribute_map, {}, create_op_info); + program->block()->push_back(operation); + + return operation; + } +}; + // NOTE, add_n op in legacy ops don't have a kernel, so we use a new op for now struct AddNOpTranscriber : public OpTranscriber { ir::OpInfo LoopkUpOpInfo(ir::IrContext* ctx, const OpDesc& op_desc) override { @@ -1159,6 +1218,7 @@ struct OneHotTranscriber : public OpTranscriber { OpTranslator::OpTranslator() { general_handler = OpTranscriber(); special_handlers["feed"] = FeedOpTranscriber(); + special_handlers["feed_with_place"] = FeedWithPlaceOpTranscriber(); special_handlers["fetch_v2"] = FetchOpTranscriber(); special_handlers["cast"] = CastOpTranscriber(); special_handlers["split"] = SplitOpTranscriber(); @@ -1167,8 +1227,10 @@ OpTranslator::OpTranslator() { special_handlers["assign_value"] = AssignValueOpTranscriber(); special_handlers["increment"] = IncrementOpTranscriber(); special_handlers["rnn"] = RnnOpTranscriber(); + special_handlers["shaddow_output"] = ShaddowOutputOpTranscriber(); special_handlers["one_hot_v2"] = OneHotTranscriber(); special_handlers["add_n"] = AddNOpTranscriber(); + special_handlers["sum"] = AddNOpTranscriber(); } } // namespace translator diff --git a/paddle/fluid/ir_adaptor/translator/program_translator.cc b/paddle/fluid/ir_adaptor/translator/program_translator.cc index b162e8198b993..202cfc61dd304 100644 --- a/paddle/fluid/ir_adaptor/translator/program_translator.cc +++ b/paddle/fluid/ir_adaptor/translator/program_translator.cc @@ -217,7 +217,15 @@ void ProgramTranslator::SetStopGradientAttributeForAllValue( continue; } ir::OpResult value = value_info.value; + if 
(!value) { + PADDLE_THROW(phi::errors::PreconditionNotMet( + "Value of [%s] can not ber None", var_name)); + } auto* defining_op = value.owner(); + PADDLE_ENFORCE_NOT_NULL( + defining_op, + phi::errors::PreconditionNotMet( + "Defining operator of [%s] can not be nullptr", var_name)); VLOG(8) << "[op translated][stop gradient]" << var_name << " from: " << defining_op->name(); std::vector stop_gradients; diff --git a/paddle/phi/api/yaml/op_compat.yaml b/paddle/phi/api/yaml/op_compat.yaml index add6520493e1f..ed7c9d4237396 100755 --- a/paddle/phi/api/yaml/op_compat.yaml +++ b/paddle/phi/api/yaml/op_compat.yaml @@ -1029,6 +1029,9 @@ - op : feed outputs: {out: Out} +- op : feed_with_place + outputs: {out: out} + - op : fft_c2c inputs: {x: X} outputs: {out: Out} @@ -2461,6 +2464,10 @@ extra : attrs : [bool use_mkldnn=false] +- op : shaddow_output + inputs: {x: x} + outputs: {out: out} + - op : shape inputs : input : Input diff --git a/paddle/phi/api/yaml/ops.yaml b/paddle/phi/api/yaml/ops.yaml index 661de64990ee6..8368184b2839d 100644 --- a/paddle/phi/api/yaml/ops.yaml +++ b/paddle/phi/api/yaml/ops.yaml @@ -826,6 +826,18 @@ inplace: (x -> out) backward : expm1_grad +- op : feed_with_place + args : (int64_t index, DataType dtype, str name, Place place) + output : Tensor(out) + infer_meta : + func : FeedWithPlaceInferMeta + param : [index, dtype] + kernel: + func : feed_with_place + param : [index, dtype] + data_type : dtype + backend : place + - op : fft_c2c args : (Tensor x, int64_t[] axes, str normalization, bool forward) output : Tensor @@ -2212,6 +2224,16 @@ optional : master_param, master_param_out inplace : (param -> param_out), (master_param -> master_param_out) +- op : shaddow_output + args : (Tensor x, str name) + output : Tensor(out) + infer_meta : + func : UnchangedInferMeta + param : [x] + kernel: + func : shaddow_output + param : [x] + - op : shape args : (Tensor input) output : Tensor(out) diff --git a/paddle/phi/api/yaml/static_ops.yaml b/paddle/phi/api/yaml/static_ops.yaml index 5ac156ff5714d..216fca178fde7 100755 --- a/paddle/phi/api/yaml/static_ops.yaml +++ b/paddle/phi/api/yaml/static_ops.yaml @@ -244,18 +244,6 @@ param : [num_rows, num_columns, dtype] data_type : dtype -- op : feed_with_place - args : (int64_t index, DataType dtype, Place place) - output : Tensor(out) - infer_meta : - func : FeedWithPlaceInferMeta - param : [index, dtype] - kernel: - func : feed_with_place - param : [index, dtype] - data_type : dtype - backend : place - - op : floor_divide args : (Tensor x, Tensor y, int axis = -1) output : Tensor(out) diff --git a/paddle/phi/kernels/cpu/feed_with_place_kernel.cc b/paddle/phi/kernels/cpu/feed_with_place_kernel.cc index 342ad6a334cc3..5044bceda26bd 100644 --- a/paddle/phi/kernels/cpu/feed_with_place_kernel.cc +++ b/paddle/phi/kernels/cpu/feed_with_place_kernel.cc @@ -26,6 +26,11 @@ void FeedWithPlaceKernel(const Context& ctx, phi::DataType data_type, DenseTensor* out) {} +template +void ShaddowOutputKernel(const Context& ctx, + const DenseTensor& x, + DenseTensor* out) {} + } // namespace phi PD_REGISTER_KERNEL( @@ -44,3 +49,6 @@ PD_REGISTER_KERNEL(shaddow_feed, phi::bfloat16, phi::complex64, phi::complex128) {} + +PD_REGISTER_KERNEL( + shaddow_output, CPU, ALL_LAYOUT, phi::ShaddowOutputKernel, float) {} diff --git a/paddle/phi/kernels/feed_with_place_kernel.h b/paddle/phi/kernels/feed_with_place_kernel.h index 4e8e9063c8d9b..725ec0c508af1 100644 --- a/paddle/phi/kernels/feed_with_place_kernel.h +++ b/paddle/phi/kernels/feed_with_place_kernel.h @@ -22,6 +22,12 
@@ template void FeedWithPlaceKernel(const Context& ctx, int64_t index, phi::DataType data_type, + // std::string name, + DenseTensor* out); + +template +void ShaddowOutputKernel(const Context& ctx, + const DenseTensor& x, DenseTensor* out); template diff --git a/test/ir/new_ir/test_feed_with_place.py b/test/ir/new_ir/test_feed_with_place.py index 5843fe227b1bf..222a5a86460b8 100644 --- a/test/ir/new_ir/test_feed_with_place.py +++ b/test/ir/new_ir/test_feed_with_place.py @@ -30,6 +30,7 @@ def feed_with_place(): 'index': 0, 'dtype': 0, 'place': 0, + 'name': "x", }, ) return out diff --git a/test/ir/new_ir/test_standalone_new_ir.py b/test/ir/new_ir/test_standalone_new_ir.py index c67370b2e0a2f..4a00c2960c286 100644 --- a/test/ir/new_ir/test_standalone_new_ir.py +++ b/test/ir/new_ir/test_standalone_new_ir.py @@ -19,11 +19,10 @@ import paddle -paddle.enable_static() - class TestNewIr(unittest.TestCase): def test_with_new_ir(self): + paddle.enable_static() place = ( paddle.CUDAPlace(0) if paddle.is_compiled_with_cuda() @@ -48,6 +47,7 @@ def test_with_new_ir(self): class TestCombineOp(unittest.TestCase): def test_with_new_ir(self): + paddle.enable_static() place = ( paddle.CUDAPlace(0) if paddle.is_compiled_with_cuda() @@ -72,6 +72,7 @@ def test_with_new_ir(self): class TestFeedOp(unittest.TestCase): def test_with_new_ir(self): + paddle.enable_static() place = ( paddle.CUDAPlace(0) if paddle.is_compiled_with_cuda() @@ -103,6 +104,7 @@ def test_with_new_ir(self): class TestSelectedRows(unittest.TestCase): def test_with_new_ir(self): + paddle.enable_static() # TODO(phlrain): support selected rows in GPU # place = paddle.CUDAPlace(0) if paddle.is_compiled_with_cuda() else paddle.CPUPlace() place = paddle.CPUPlace() @@ -127,6 +129,7 @@ def test_with_new_ir(self): class TestAddGradOp(unittest.TestCase): def test_with_new_ir(self): + paddle.enable_static() place = ( paddle.CUDAPlace(0) if paddle.is_compiled_with_cuda() @@ -141,11 +144,9 @@ def test_with_new_ir(self): x = paddle.static.data("x", [2, 2], dtype="float32") y = paddle.static.data("y", [2, 2], dtype="float32") x.stop_gradient = False - z = x * y paddle.static.gradients(z, x) - np_a = np.random.rand(2, 2).astype("float32") np_b = np.random.rand(2, 2).astype("float32") out = exe.run( @@ -159,8 +160,63 @@ def test_with_new_ir(self): np.testing.assert_array_equal(out[0], gold_res) +class TestNewIrDygraph(unittest.TestCase): + def test_with_new_ir(self): + paddle.disable_static() + # paddle.device.set_device("cpu") + + @paddle.jit.to_static + def func(x, y): + return x + y + + x = paddle.ones([2, 2], dtype='float32') + y = paddle.ones([2, 2], dtype='float32') + z = func(x, y) + + gold_res = np.ones([2, 2], dtype="float32") * 2 + self.assertEqual( + np.array_equal( + z.numpy(), + gold_res, + ), + True, + ) + + +class TestNewIrBackwardDygraph(unittest.TestCase): + def test_with_new_ir(self): + paddle.disable_static() + build_strategy = paddle.static.BuildStrategy() + build_strategy.enable_inplace = False + + @paddle.jit.to_static(build_strategy=build_strategy) + def func(x, y): + return x * y + + x = paddle.ones([2, 2], dtype='float32') + y = paddle.ones([2, 2], dtype='float32') + x.stop_gradient = False + y.stop_gradient = False + z = func(x, y) + loss = z.mean() + loss.backward() + gold_res = np.ones([2, 2], dtype="float32") + self.assertEqual( + np.array_equal( + z.numpy(), + gold_res, + ), + True, + ) + + gold_res = np.ones([2, 2], dtype="float32") * 0.25 + np.testing.assert_array_equal(x.gradient(), gold_res) + 
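# y's gradient mirrors x's: d(mean(x * y))/dy = x / 4 = 0.25 elementwise
+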
np.testing.assert_array_equal(y.gradient(), gold_res) + + class TestSplitOp(unittest.TestCase): def test_with_new_ir(self): + paddle.enable_static() place = ( paddle.CUDAPlace(0) if paddle.is_compiled_with_cuda() @@ -186,4 +242,5 @@ def test_with_new_ir(self): if __name__ == "__main__": + paddle.enable_static() unittest.main() From 6093a7ed0a383737102e2c50c4b71aae456c0bf0 Mon Sep 17 00:00:00 2001 From: tianshuo78520a <707759223@qq.com> Date: Tue, 25 Jul 2023 13:41:19 +0800 Subject: [PATCH 27/34] Update ccache (#55136) * Update ccache * del 3.7.9 * fix error --- tools/dockerfile/Dockerfile.ubuntu20 | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/tools/dockerfile/Dockerfile.ubuntu20 b/tools/dockerfile/Dockerfile.ubuntu20 index 98bc6ec80731a..f7aeb670bab65 100644 --- a/tools/dockerfile/Dockerfile.ubuntu20 +++ b/tools/dockerfile/Dockerfile.ubuntu20 @@ -147,13 +147,14 @@ RUN pip3.7 --no-cache-dir install -r /root/requirements.txt && \ #RUN mkdir /var/run/sshd && echo 'root:root' | chpasswd && sed -ri 's/^PermitRootLogin\s+.*/PermitRootLogin yes/' /etc/ssh/sshd_config && sed -ri 's/UsePAM yes/#UsePAM yes/g' /etc/ssh/sshd_config #CMD source ~/.bashrc -# ccache 3.7.9 -RUN wget https://paddle-ci.gz.bcebos.com/ccache-3.7.9.tar.gz && \ - tar xf ccache-3.7.9.tar.gz && mkdir /usr/local/ccache-3.7.9 && cd ccache-3.7.9 && \ - ./configure -prefix=/usr/local/ccache-3.7.9 && \ +# ccache 4.2.0 +RUN wget -q https://paddle-ci.gz.bcebos.com/ccache-4.8.2.tar.gz && \ + tar xf ccache-4.8.2.tar.gz && mkdir /usr/local/ccache-4.8.2 && cd ccache-4.8.2 && \ + mkdir build && cd build && \ + cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=/usr/local/ccache-4.8.2 .. && \ make -j8 && make install && \ - ln -s /usr/local/ccache-3.7.9/bin/ccache /usr/local/bin/ccache && \ - cd ../ && rm -rf ccache-3.7.9 ccache-3.7.9.tar.gz + ln -s /usr/local/ccache-4.8.2/bin/ccache /usr/local/bin/ccache && \ + cd ../../ && rm -rf ccache-4.8.2.tar.gz # clang+llvm 3.8.0 RUN wget https://paddle-ci.cdn.bcebos.com/clang+llvm-3.8.0-x86_64-linux-gnu-ubuntu-16.04.tar.xz && \ From 690ffe814dbfc5054d4e92df878687fd638fe3a5 Mon Sep 17 00:00:00 2001 From: wanghuancoder Date: Tue, 25 Jul 2023 13:56:45 +0800 Subject: [PATCH 28/34] fix div 0 bug (#55644) --- paddle/phi/kernels/cpu/nanmedian_kernel.cc | 8 ++++++++ paddle/phi/kernels/gpu/matrix_rank_tol_kernel.cu | 12 ++++++++++++ paddle/phi/kernels/gpu/nanmedian_kernel.cu | 8 ++++++++ 3 files changed, 28 insertions(+) diff --git a/paddle/phi/kernels/cpu/nanmedian_kernel.cc b/paddle/phi/kernels/cpu/nanmedian_kernel.cc index 558d5aaebfef8..92571124dd1a6 100644 --- a/paddle/phi/kernels/cpu/nanmedian_kernel.cc +++ b/paddle/phi/kernels/cpu/nanmedian_kernel.cc @@ -116,6 +116,14 @@ void ProcessMedianKernel(const Context& dev_ctx, auto x_dim = x.dims(); int64_t x_rank = x_dim.size(); int64_t stride = x_dim[x_rank - 1]; + + PADDLE_ENFORCE_NE( + stride, + 0, + phi::errors::InvalidArgument("The input Tensor x's shape[-1] should not " + "be 0, but shape is %s now.", + x_dim)); + int64_t pre_dim = numel / stride; int64_t i = 0; diff --git a/paddle/phi/kernels/gpu/matrix_rank_tol_kernel.cu b/paddle/phi/kernels/gpu/matrix_rank_tol_kernel.cu index e4ee1f342131a..33de3c8e17876 100644 --- a/paddle/phi/kernels/gpu/matrix_rank_tol_kernel.cu +++ b/paddle/phi/kernels/gpu/matrix_rank_tol_kernel.cu @@ -340,6 +340,18 @@ void MatrixRankTolKernel(const Context& dev_ctx, auto dim_out = out->dims(); int rows = dim_x[dim_x.size() - 2]; int cols = dim_x[dim_x.size() - 1]; + PADDLE_ENFORCE_NE( + 
rows, + 0, + phi::errors::InvalidArgument("The input Tensor x's shape[-2] should not " + "be 0, but shape is %s now.", + dim_x)); + PADDLE_ENFORCE_NE( + cols, + 0, + phi::errors::InvalidArgument("The input Tensor x's shape[-1] should not " + "be 0, but shape is %s now.", + dim_x)); int k = std::min(rows, cols); auto numel = x.numel(); int batches = numel / (rows * cols); diff --git a/paddle/phi/kernels/gpu/nanmedian_kernel.cu b/paddle/phi/kernels/gpu/nanmedian_kernel.cu index 8a6be7a9bdef0..5a9d3a07cf55d 100644 --- a/paddle/phi/kernels/gpu/nanmedian_kernel.cu +++ b/paddle/phi/kernels/gpu/nanmedian_kernel.cu @@ -149,6 +149,14 @@ void ProcessMedianKernel(const Context& dev_ctx, auto x_dim = x.dims(); int64_t x_rank = x_dim.size(); int64_t stride = x_dim[x_rank - 1]; + + PADDLE_ENFORCE_NE( + stride, + 0, + phi::errors::InvalidArgument("The input Tensor x's shape[-1] should not " + "be 0, but shape is %s now.", + x_dim)); + int64_t pre_dim = numel / stride; int64_t i = 0; From 0cd422b66ad449af65685db3a345a423f90dbd71 Mon Sep 17 00:00:00 2001 From: Lucas <33367939+cqulilujia@users.noreply.github.com> Date: Tue, 25 Jul 2023 14:25:42 +0800 Subject: [PATCH 29/34] fix bugs in rnn op (#55656) --- paddle/phi/kernels/xpu/rnn_kernel.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/phi/kernels/xpu/rnn_kernel.cc b/paddle/phi/kernels/xpu/rnn_kernel.cc index 10fdfdbc4b91f..87773c8a97267 100644 --- a/paddle/phi/kernels/xpu/rnn_kernel.cc +++ b/paddle/phi/kernels/xpu/rnn_kernel.cc @@ -44,7 +44,7 @@ void RnnKernel(const Context& dev_ctx, } dropout_state->Resize(out->dims()); - dev_ctx.template Alloc(dropout_state); + dev_ctx.template Alloc(dropout_state); phi::funcs::SetConstant ones; ones(dev_ctx, dropout_state, static_cast(1)); @@ -97,7 +97,7 @@ void RnnKernel(const Context& dev_ctx, int gate_num = 4; int hidden_data_idx = (num_layers - 1); - hidden_data_idx += (gate_num + 1) * num_layers; + hidden_data_idx += (gate_num + 2) * num_layers; const int& block_size = direction_num * seq_len * batch_size * hidden_size; reserve->Resize({hidden_data_idx, block_size}); dev_ctx.template Alloc(reserve); From 7da1ffbe4086bbe9bbc9368baa29c04d3e40d4ac Mon Sep 17 00:00:00 2001 From: wentao yu Date: Tue, 25 Jul 2023 14:37:07 +0800 Subject: [PATCH 30/34] remove fluid allreduce op (#55672) --- .../operators/collective/allreduce_op.cc | 84 ----------------- .../operators/collective/allreduce_op.cu.cc | 28 ------ .../fluid/operators/collective/allreduce_op.h | 93 ------------------- 3 files changed, 205 deletions(-) delete mode 100644 paddle/fluid/operators/collective/allreduce_op.cc delete mode 100644 paddle/fluid/operators/collective/allreduce_op.cu.cc delete mode 100644 paddle/fluid/operators/collective/allreduce_op.h diff --git a/paddle/fluid/operators/collective/allreduce_op.cc b/paddle/fluid/operators/collective/allreduce_op.cc deleted file mode 100644 index dc3dfff58e96e..0000000000000 --- a/paddle/fluid/operators/collective/allreduce_op.cc +++ /dev/null @@ -1,84 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/collective/allreduce_op.h" - -#include // NOLINT -#include - -namespace paddle { -namespace operators { - -class AllReduceDelOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override {} - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - return phi::KernelKey(OperatorWithKernel::IndicateVarDataType(ctx, "X"), - ctx.GetPlace()); - } -}; - -class AllReduceDelOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", "(Tensor), tensor to be allreduced."); - AddOutput("Out", "(Tensor) the result of allreduced."); - AddAttr("reduce_type", "(int) determine the reduce type.") - .SetDefault(0); - AddAttr( - "sync_mode", - "(bool) whether to synchronize the CUDA stream after nccl call.") - .SetDefault(false); - AddComment(R"DOC( -***AllReduce Operator*** - -Call NCCL AllReduce internally. Note that this op must be used when one -thread is managing one GPU device. - -For speed reasons, reduce_type should be an integer: - -0: sum -1: prod -2: max -3: min - -If input and output are the same variable, in-place allreduce will be used. -)DOC"); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_WITHOUT_GRADIENT(allreduce, - ops::AllReduceDelOp, - ops::AllReduceDelOpMaker); - -PD_REGISTER_STRUCT_KERNEL(allreduce, - CPU, - ALL_LAYOUT, - ops::AllReduceOpKernel, - float, - double, - int, - int64_t, - plat::float16) {} diff --git a/paddle/fluid/operators/collective/allreduce_op.cu.cc b/paddle/fluid/operators/collective/allreduce_op.cu.cc deleted file mode 100644 index 0c9b95c76866b..0000000000000 --- a/paddle/fluid/operators/collective/allreduce_op.cu.cc +++ /dev/null @@ -1,28 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/collective/allreduce_op.h" - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -PD_REGISTER_STRUCT_KERNEL(allreduce, - GPU, - ALL_LAYOUT, - ops::AllReduceOpKernel, - float, - double, - int, - int64_t, - plat::float16) {} diff --git a/paddle/fluid/operators/collective/allreduce_op.h b/paddle/fluid/operators/collective/allreduce_op.h deleted file mode 100644 index 794e37c312a9b..0000000000000 --- a/paddle/fluid/operators/collective/allreduce_op.h +++ /dev/null @@ -1,93 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include -#include -#include - -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/op_registry.h" - -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) -#include "paddle/fluid/platform/device/gpu/nccl_helper.h" -#endif - -namespace paddle { -namespace operators { - -template -class AllReduceOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto place = ctx.GetPlace(); - PADDLE_ENFORCE_EQ(platform::is_gpu_place(place), - true, - platform::errors::PreconditionNotMet( - "AllReduce op can run on gpu place only for now.")); -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) - auto& dev_ctx = ctx.template device_context(); - auto in = ctx.Input("X"); - auto out = ctx.Output("Out"); - - int dtype = - platform::ToNCCLDataType(framework::TransToProtoVarType(in->dtype())); - int64_t numel = in->numel(); - auto* sendbuff = in->data(); - out->Resize(in->dims()); - void* recvbuff = out->mutable_data(place); - - auto* comm = dev_ctx.nccl_comm(); - // FIXME(typhoonzero): should use nccl stream here. - auto stream = dev_ctx.stream(); - PADDLE_ENFORCE_NOT_NULL( - stream, platform::errors::NotFound("Should initialize NCCL firstly.")); - - int reduce_type = ctx.Attr("reduce_type"); - ncclRedOp_t red_type = ncclSum; - switch (reduce_type) { - case 0: - red_type = ncclSum; - break; - case 1: - red_type = ncclProd; - break; - case 2: - red_type = ncclMax; - break; - case 3: - red_type = ncclMin; - break; - } - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::ncclAllReduce(sendbuff, - recvbuff, - numel, - static_cast(dtype), - red_type, - comm, - stream)); - if (ctx.Attr("sync_mode")) { - platform::GpuStreamSync(stream); - } -#else - PADDLE_THROW(platform::errors::PreconditionNotMet( - "PaddlePaddle should compile with GPU.")); -#endif - } -}; - -} // namespace operators -} // namespace paddle From c737f0aedb64a3a62967ed8683f68f4fa677e8f6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=82=85=E5=89=91=E5=AF=92?= Date: Tue, 25 Jul 2023 14:37:51 +0800 Subject: [PATCH 31/34] add all false bool indices support for index_put (#55655) --- .../phi/kernels/cpu/index_put_grad_kernel.cc | 14 +++ paddle/phi/kernels/cpu/index_put_kernel.cc | 6 + paddle/phi/kernels/funcs/index_put_utils.h | 5 + .../phi/kernels/gpu/index_put_grad_kernel.cu | 15 +++ paddle/phi/kernels/gpu/index_put_kernel.cu | 6 + test/legacy_test/test_index_put_op.py | 107 +++++++++++++++--- 6 files changed, 140 insertions(+), 13 deletions(-) diff --git a/paddle/phi/kernels/cpu/index_put_grad_kernel.cc b/paddle/phi/kernels/cpu/index_put_grad_kernel.cc index 9426a5def6527..89b3ba61b83d3 100644 --- a/paddle/phi/kernels/cpu/index_put_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/index_put_grad_kernel.cc @@ -16,6 +16,7 @@ #include #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/cast_kernel.h" +#include "paddle/phi/kernels/full_kernel.h" #include "paddle/phi/kernels/funcs/index_put_utils.h" #include "paddle/phi/kernels/reduce_sum_kernel.h" @@ -188,6 +189,19 @@ void 
IndexPutGradKernel(const Context& dev_ctx, std::vector tmp_args; std::vector int_indices_v = funcs::DealWithBoolIndices(dev_ctx, indices, &tmp_args); + if (int_indices_v.empty()) { + if (x_grad) { + phi::Copy(dev_ctx, out_grad, dev_ctx.GetPlace(), false, x_grad); + } + if (value_grad) { + FullKernel(dev_ctx, + phi::vectorize(value_grad->dims()), + 0.0f, + value_grad->dtype(), + value_grad); + } + return; + } auto bd_dim = funcs::BroadCastTensorsDims(int_indices_v); std::vector res_dim_v(phi::vectorize(bd_dim)); diff --git a/paddle/phi/kernels/cpu/index_put_kernel.cc b/paddle/phi/kernels/cpu/index_put_kernel.cc index 3ab05fea8fa47..34701539c8ef3 100644 --- a/paddle/phi/kernels/cpu/index_put_kernel.cc +++ b/paddle/phi/kernels/cpu/index_put_kernel.cc @@ -117,6 +117,12 @@ void IndexPutKernel(const Context& dev_ctx, std::vector tmp_args; std::vector int_indices_v = funcs::DealWithBoolIndices(dev_ctx, indices, &tmp_args); + if (int_indices_v.empty()) { + if (!out->initialized()) { + phi::Copy(dev_ctx, x, dev_ctx.GetPlace(), false, out); + } + return; + } auto bd_dim = funcs::BroadCastTensorsDims(int_indices_v); diff --git a/paddle/phi/kernels/funcs/index_put_utils.h b/paddle/phi/kernels/funcs/index_put_utils.h index c135cb82e2ec3..1e0c9eaaaf5a5 100644 --- a/paddle/phi/kernels/funcs/index_put_utils.h +++ b/paddle/phi/kernels/funcs/index_put_utils.h @@ -88,6 +88,11 @@ std::vector DealWithBoolIndices( nonzero_indices.Resize(phi::make_ddim({-1, rank})); NonZeroKernel(dev_ctx, *indices_v[i], &nonzero_indices); + if (nonzero_indices.numel() == 0) { + std::vector empty_indices; + return empty_indices; + } + std::vector integer_indices(rank, nullptr); const int tmp_ix = tmp_indices_v->size(); for (int i = 0; i < rank; ++i) { diff --git a/paddle/phi/kernels/gpu/index_put_grad_kernel.cu b/paddle/phi/kernels/gpu/index_put_grad_kernel.cu index 1a301323cad87..8f2eba7185293 100644 --- a/paddle/phi/kernels/gpu/index_put_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/index_put_grad_kernel.cu @@ -18,6 +18,7 @@ #include "paddle/phi/backends/gpu/gpu_launch_config.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/cast_kernel.h" +#include "paddle/phi/kernels/full_kernel.h" #include "paddle/phi/kernels/funcs/index_put_utils.h" #include "paddle/phi/kernels/reduce_sum_kernel.h" @@ -219,6 +220,20 @@ void IndexPutGradKernel(const Context& dev_ctx, std::vector tmp_args; std::vector int_indices_v = funcs::DealWithBoolIndices(dev_ctx, indices, &tmp_args); + if (int_indices_v.empty()) { + if (x_grad) { + phi::Copy(dev_ctx, out_grad, dev_ctx.GetPlace(), false, x_grad); + } + if (value_grad) { + FullKernel(dev_ctx, + phi::vectorize(value_grad->dims()), + 0.0f, + value_grad->dtype(), + value_grad); + } + return; + } + const size_t total_dims = x.dims().size(); auto bd_dim = funcs::BroadCastTensorsDims(int_indices_v); diff --git a/paddle/phi/kernels/gpu/index_put_kernel.cu b/paddle/phi/kernels/gpu/index_put_kernel.cu index 9710f5baca77d..4244e755b6597 100644 --- a/paddle/phi/kernels/gpu/index_put_kernel.cu +++ b/paddle/phi/kernels/gpu/index_put_kernel.cu @@ -118,6 +118,12 @@ void IndexPutKernel(const Context& dev_ctx, std::vector tmp_args; std::vector int_indices_v = funcs::DealWithBoolIndices(dev_ctx, indices, &tmp_args); + if (int_indices_v.empty()) { + if (!out->initialized()) { + phi::Copy(dev_ctx, x, dev_ctx.GetPlace(), false, out); + } + return; + } const size_t total_dims = x.dims().size(); auto bd_dim = funcs::BroadCastTensorsDims(int_indices_v); diff --git a/test/legacy_test/test_index_put_op.py 
b/test/legacy_test/test_index_put_op.py index f21f7b084bde4..44e50c1567721 100644 --- a/test/legacy_test/test_index_put_op.py +++ b/test/legacy_test/test_index_put_op.py @@ -47,14 +47,15 @@ def has_duplicate_index(indices, shapes): return True -def gen_indices_np(x_shape, indices_shapes, index_type): +def gen_indices_np(x_shape, indices_shapes, index_type, is_all_false): indices = [] if index_type == np.bool_: indice = np.zeros(indices_shapes[0], dtype=np.bool_) - indice.flatten() - for i in range(len(indice)): - indice[i] = (i & 1) == 0 - indice = indice.reshape(indices_shapes[0]) + if not is_all_false: + indice.flatten() + for i in range(len(indice)): + indice[i] = (i & 1) == 0 + indice = indice.reshape(indices_shapes[0]) indices.append(indice) else: while True: @@ -78,6 +79,7 @@ def gen_indices_np(x_shape, indices_shapes, index_type): class TestIndexPutAPIBase(unittest.TestCase): def setUp(self): self.mixed_indices = False + self.is_all_false = False self.init_dtype_type() self.setPlace() self.x_np = np.random.random(self.x_shape).astype(self.dtype_np) @@ -85,17 +87,26 @@ def setUp(self): if self.mixed_indices: tmp_indices_np1 = gen_indices_np( - self.x_shape, self.indices_shapes, self.index_type_np + self.x_shape, + self.indices_shapes, + self.index_type_np, + self.is_all_false, ) tmp_indices_np2 = gen_indices_np( - self.x_shape, self.indices_shapes1, self.index_type_np1 + self.x_shape, + self.indices_shapes1, + self.index_type_np1, + self.is_all_false, ) self.indices_np = tuple( list(tmp_indices_np1) + list(tmp_indices_np2) ) else: self.indices_np = gen_indices_np( - self.x_shape, self.indices_shapes, self.index_type_np + self.x_shape, + self.indices_shapes, + self.index_type_np, + self.is_all_false, ) def init_dtype_type(self): @@ -565,6 +576,32 @@ def init_dtype_type(self): self.accumulate = True +class TestIndexPutAPI31(TestIndexPutAPIBase): + def init_dtype_type(self): + self.dtype_np = np.bool_ + self.index_type_np = np.int32 + self.x_shape = (100, 110) + self.indices_shapes = [(21,), (21,)] + self.value_shape = (21,) + self.dtype_pd = paddle.bool + self.index_type_pd = paddle.int32 + self.accumulate = False + self.is_all_false = True + + +class TestIndexPutAPI32(TestIndexPutAPIBase): + def init_dtype_type(self): + self.dtype_np = np.bool_ + self.index_type_np = np.int32 + self.x_shape = (100, 110) + self.indices_shapes = [(21,), (21,)] + self.value_shape = (21,) + self.dtype_pd = paddle.bool + self.index_type_pd = paddle.int32 + self.accumulate = True + self.is_all_false = True + + class TestIndexPutInplaceAPI(unittest.TestCase): def setUp(self): self.init_dtype_type() @@ -572,7 +609,7 @@ def setUp(self): self.x_np = np.random.random(self.x_shape).astype(self.dtype_np) self.value_np = np.random.random(self.value_shape).astype(self.dtype_np) self.indices_np = gen_indices_np( - self.x_shape, self.indices_shapes, self.index_type_np + self.x_shape, self.indices_shapes, self.index_type_np, False ) def init_dtype_type(self): @@ -678,7 +715,7 @@ def test_backward(self): atol=1e-7, ) - def test_backwardScalarVal(self): + def test_backward_scalarval(self): paddle.disable_static() for place in self.place: paddle.device.set_device(place) @@ -719,7 +756,7 @@ def test_backwardScalarVal(self): np.array([4.0], dtype=np.float64), dvalue.numpy(), atol=1e-7 ) - def test_backwardBroadCastValue(self): + def test_backward_broadcastvalue(self): paddle.disable_static() for place in self.place: paddle.device.set_device(place) @@ -764,7 +801,7 @@ def test_backwardBroadCastValue(self): atol=1e-7, ) - def 
test_backwardBroadCastValue1(self): + def test_backward_broadcastvalue1(self): paddle.disable_static() for place in self.place: paddle.device.set_device(place) @@ -809,7 +846,7 @@ def test_backwardBroadCastValue1(self): atol=1e-7, ) - def test_backwardBroadCastValue2(self): + def test_backward_broadcastvalue2(self): paddle.disable_static() for place in self.place: paddle.device.set_device(place) @@ -854,6 +891,50 @@ def test_backwardBroadCastValue2(self): atol=1e-7, ) + def test_backward_all_false_bool_indice(self): + paddle.disable_static() + for place in self.place: + paddle.device.set_device(place) + value = paddle.ones(shape=[2, 1], dtype=paddle.float64) + x = paddle.ones(shape=[16, 21], dtype=paddle.float64) + ix = paddle.zeros(shape=[16, 21], dtype=paddle.bool) + + value.stop_gradient = False + x.stop_gradient = False + out = paddle.index_put(x, (ix,), value, False) + + dx, dvalue = paddle.grad( + outputs=[out], + inputs=[x, value], + create_graph=False, + retain_graph=True, + ) + ref_dx = np.ones(shape=[16, 21], dtype=np.float64) + + np.testing.assert_allclose(ref_dx, dx.numpy(), atol=1e-7) + np.testing.assert_allclose( + np.array([[0.0], [0.0]], dtype=np.float64), + dvalue.numpy(), + atol=1e-7, + ) + + out = paddle.index_put(x, (ix,), value, True) + + dx, dvalue = paddle.grad( + outputs=[out], + inputs=[x, value], + create_graph=False, + retain_graph=True, + ) + ref_dx = np.ones(shape=[16, 21], dtype=np.float64) + + np.testing.assert_allclose(ref_dx, dx.numpy(), atol=1e-7) + np.testing.assert_allclose( + np.array([[0.0], [0.0]], dtype=np.float64), + dvalue.numpy(), + atol=1e-7, + ) + def test_backward_in_static(self): paddle.enable_static() exe = paddle.static.Executor() From f9e1b2d2ad1d9964a80d20df72609ef4d18f83fe Mon Sep 17 00:00:00 2001 From: Aurelius84 Date: Tue, 25 Jul 2023 15:05:40 +0800 Subject: [PATCH 32/34] [NewIR]Support Instruction.Run in CINN for Runtime::Program (#55680) --- paddle/cinn/hlir/framework/new_ir_compiler.h | 53 +++++++++++++------ .../cpp/ir/cinn/graph_compiler_new_ir_test.cc | 20 +++++-- 2 files changed, 53 insertions(+), 20 deletions(-) diff --git a/paddle/cinn/hlir/framework/new_ir_compiler.h b/paddle/cinn/hlir/framework/new_ir_compiler.h index c92e84563e7aa..fc4944d1ca241 100644 --- a/paddle/cinn/hlir/framework/new_ir_compiler.h +++ b/paddle/cinn/hlir/framework/new_ir_compiler.h @@ -13,6 +13,7 @@ // limitations under the License. #pragma once +#include #include #include #include "paddle/cinn/common/context.h" @@ -30,9 +31,15 @@ namespace cinn { namespace hlir { namespace framework { -// TODO(Aurelius): Need add name mapping logic in REGISTER_CINN_OP -// macros or attempt to unify Op name with Paddle and CINN. -static const std::unordered_map OP_NAMES = { +struct CompatibleInfo { + static constexpr char* kInputPrefix = "input_"; + static constexpr char* kOutputPrefix = "output_"; + // TODO(Aurelius): Need add name mapping logic in REGISTER_CINN_OP + // macros or attempt to unify Op name with Paddle and CINN. + static const std::unordered_map OP_NAMES; +}; + +const std::unordered_map CompatibleInfo::OP_NAMES = { {"pd.full", "fill_constant"}, {"pd.matmul", "matmul"}}; // TODO(Aurelius84): Need abstract this logic to implement Proxy for @@ -70,18 +77,32 @@ class NewIRCompiler final { compiler_->Build(build_module, ""); auto instructions = BuildInstructions(groups); + + // TODO(Aurelius84): Instantiate all tensors on compile-time, which is + // controlled by 'options.with_instantiate_variables' in GraphCompiler. 
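+    // Holding every buffer for the lifetime of the program trades peak
+    // memory for the guarantee that Instruction::Run never touches an
+    // unallocated tensor.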
+ // Moreover, it's better to implement InsertBufferHandlers() logic + // to automatically insert Malloc and Free instructions. + for (auto& name : scope_->var_names()) { + std::string var_name({name.data(), name.size()}); + VLOG(4) << "Instantiate " << var_name << " on compile-time"; + auto* var = scope_->Var(var_name); + auto& tensor = absl::get(*var); + tensor->mutable_data(target_, tensor->type()); + } return std::make_unique(scope_, std::move(instructions)); } std::vector GetOpFunc(const ::ir::Operation& op, int idx) { std::vector inputs; std::vector cinn_inputs; - VLOG(4) << "GetOpFunc for op: " << op.name(); + auto op_name = op.name(); + VLOG(4) << "GetOpFunc for op: " << op_name; // step 1: Deal with Oprands for (int i = 0; i < op.num_operands(); ++i) { auto in_value = op.operand(i); // TODO(Aurelius84): For now, use addr as name but it's not wise. - std::string input_id = std::to_string(std::hash<::ir::Value>()(in_value)); + std::string input_id = CompatibleInfo::kInputPrefix + + std::to_string(std::hash<::ir::Value>()(in_value)); // NOTE(Aurelius84): whether need to support other Type? auto type_info = in_value.type().dyn_cast(); @@ -100,8 +121,7 @@ class NewIRCompiler final { cinn_inputs.push_back(common::CINNValue(temp)); } for (auto out_name : OpGetOutputNames(op)) { - cinn_inputs.push_back( - common::CINNValue(op.name().substr(3) + "_" + out_name)); + cinn_inputs.push_back(common::CINNValue(out_name)); } VLOG(4) << "inputs.size(): " << inputs.size(); @@ -124,14 +144,14 @@ class NewIRCompiler final { { VLOG(4) << "op.attributes():" << op.attributes().size(); auto attrs = utils::ConvertAttributes(op.attributes()); - node_attrs.node_name = OP_NAMES.at(op.name()); + node_attrs.node_name = CompatibleInfo::OP_NAMES.at(op_name); node_attrs.attr_store = std::move(attrs); } auto& strategy = Operator::GetAttrs("CINNStrategy"); // NOTE(Aurelius84): Do we need replace all hlir::framework Operator with // ::ir::Program ? 
const hlir::framework::Operator* cinn_op = - Operator::Get(OP_NAMES.at(op.name())); + Operator::Get(CompatibleInfo::OP_NAMES.at(op_name)); auto impl = OpStrategy::SelectImpl( strategy[cinn_op](node_attrs, inputs, out_types, out_shapes, target_)); common::CINNValuePack C = @@ -223,7 +243,8 @@ class NewIRCompiler final { std::unordered_set repeat; for (int i = 0; i < op.num_operands(); ++i) { auto value = op.operand(i); - std::string name = std::to_string(std::hash<::ir::Value>()(value)); + std::string name = CompatibleInfo::kInputPrefix + + std::to_string(std::hash<::ir::Value>()(value)); if (repeat.count(name)) { continue; } @@ -237,7 +258,8 @@ class NewIRCompiler final { std::vector names; for (int i = 0; i < op.num_results(); ++i) { auto value = op.result(i); - std::string name = std::to_string(std::hash<::ir::Value>()(value)); + std::string name = CompatibleInfo::kOutputPrefix + + std::to_string(std::hash<::ir::Value>()(value)); names.push_back(std::move(name)); } return names; @@ -257,11 +279,12 @@ std::shared_ptr BuildScope(const Target& target, std::unordered_set<::ir::Value> visited; auto scope = std::make_shared(); - auto create_var = [&](::ir::Value value) { + auto create_var = [&](const std::string& name_prefix, ::ir::Value value) { if (visited.count(value) > 0) return; visited.emplace(value); - std::string name = std::to_string(std::hash<::ir::Value>()(value)); + std::string name = + name_prefix + std::to_string(std::hash<::ir::Value>()(value)); auto type_info = value.type().dyn_cast(); auto* var = scope->Var(name); auto& tensor = absl::get(*var); @@ -279,12 +302,12 @@ std::shared_ptr BuildScope(const Target& target, // visit OpOprands for (auto i = 0; i < (*it)->num_operands(); ++i) { auto in_value = (*it)->operand(i); - create_var(in_value); + create_var(CompatibleInfo::kInputPrefix, in_value); } for (auto i = 0; i < (*it)->num_results(); ++i) { auto out_value = (*it)->result(i); - create_var(out_value); + create_var(CompatibleInfo::kOutputPrefix, out_value); } } return scope; diff --git a/test/cpp/ir/cinn/graph_compiler_new_ir_test.cc b/test/cpp/ir/cinn/graph_compiler_new_ir_test.cc index a5f04d257357a..42ef6fe53d059 100644 --- a/test/cpp/ir/cinn/graph_compiler_new_ir_test.cc +++ b/test/cpp/ir/cinn/graph_compiler_new_ir_test.cc @@ -24,6 +24,7 @@ #include "paddle/cinn/frontend/net_builder.h" #include "paddle/cinn/frontend/optimize.h" #include "paddle/cinn/hlir/framework/graph_compiler.h" +#include "paddle/cinn/utils/data_util.h" #include "paddle/cinn/hlir/framework/new_ir_compiler.h" @@ -33,15 +34,16 @@ TEST(GraphCompier, TestNewIR) { ::ir::Program program(ctx); ::ir::Builder builder = ::ir::Builder(ctx, program.block()); + const float value = 2.0; auto full_op_x = builder.Build(std::vector{64, 128}, - 1.0, + value, phi::DataType::FLOAT32, phi::CPUPlace()); auto full_op_y = builder.Build(std::vector{128, 64}, - 2.0, + value, phi::DataType::FLOAT32, phi::CPUPlace()); // TODO(Aurelius84): test more op @@ -61,7 +63,15 @@ TEST(GraphCompier, TestNewIR) { cinn::hlir::framework::NewIRCompiler ir_compiler(program, target, scope); auto runtime_program = ir_compiler.Build(); - // FIXME(Aurelius84): It raised illegal memory access while deconstructor - // after running all instruction, but it's ok under GLOG_v=10. 
-  // ASSERT_NO_THROW(runtime_program->Execute());
+  ASSERT_NO_THROW(runtime_program->Execute());
+
+  for (auto& var_name : scope->var_names()) {
+    std::string name = {var_name.begin(), var_name.end()};
+    std::vector<float> data =
+        cinn::GetTensorData<float>(scope->GetTensor(name), target);
+    for (int i = 0; i < data.size(); ++i) {
+      LOG_FIRST_N(INFO, 3) << "data: " << data[i];
+      ASSERT_NEAR(data[i], value, 1e-5);
+    }
+  }
 }

From 017a6164d934b001c25cb8c454345ad449747c24 Mon Sep 17 00:00:00 2001
From: Jeng Bai-Cheng
Date: Tue, 25 Jul 2023 15:24:05 +0800
Subject: [PATCH 33/34] Bugfix, fast layer norm, OOB (#55639)

* Fix LayerNormForward perf issue

* Bugfix, fast_layer_norm OOB

* apply pre-commit

---------

Co-authored-by: Shijie Wang
---
 paddle/phi/kernels/funcs/layer_norm_impl.cu.h | 21 +++++++++++++++----
 paddle/phi/kernels/gpu/layer_norm_kernel.cu   |  6 +++---
 test/legacy_test/test_layer_norm_op.py        |  8 ++++++-
 3 files changed, 27 insertions(+), 8 deletions(-)

diff --git a/paddle/phi/kernels/funcs/layer_norm_impl.cu.h b/paddle/phi/kernels/funcs/layer_norm_impl.cu.h
index 1d067b0fc2918..e2d908b853188 100644
--- a/paddle/phi/kernels/funcs/layer_norm_impl.cu.h
+++ b/paddle/phi/kernels/funcs/layer_norm_impl.cu.h
@@ -217,8 +217,13 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fast_ln_fwd_kernel(
   Vec_scale beta[LDGS];
 #pragma unroll
   for (int it = 0, col = c; it < LDGS; it++) {
-    phi::Load<ScaleT, VecSize>(gamma_ptr + col * VecSize, &gamma[it]);
-    phi::Load<ScaleT, VecSize>(beta_ptr + col * VecSize, &beta[it]);
+    if (col < cols) {
+      phi::Load<ScaleT, VecSize>(gamma_ptr + col * VecSize, &gamma[it]);
+      phi::Load<ScaleT, VecSize>(beta_ptr + col * VecSize, &beta[it]);
+    } else {
+      gamma[it] = Vec_scale{};
+      beta[it] = Vec_scale{};
+    }
     col += THREADS_PER_ROW;
   }
 
@@ -227,7 +232,12 @@
   Vec x[LDGS];
 #pragma unroll
   for (int it = 0, col = c; it < LDGS; it++) {
-    phi::Load<T, VecSize>(x_ptr + row * ELTS_PER_ROW + col * VecSize, &x[it]);
+    if (col < cols) {
+      phi::Load<T, VecSize>(x_ptr + row * ELTS_PER_ROW + col * VecSize,
+                            &x[it]);
+    } else {
+      x[it] = Vec{};
+    }
     col += THREADS_PER_ROW;
   }
   U xf[LDGS * VecSize];
@@ -324,7 +334,10 @@
 
 #pragma unroll
   for (int it = 0, col = c; it < LDGS; it++) {
-    phi::Store<T, VecSize>(x[it], y_ptr + row * ELTS_PER_ROW + col * VecSize);
+    if (col < cols) {
+      phi::Store<T, VecSize>(x[it],
+                             y_ptr + row * ELTS_PER_ROW + col * VecSize);
+    }
     col += THREADS_PER_ROW;
   }
 }
diff --git a/paddle/phi/kernels/gpu/layer_norm_kernel.cu b/paddle/phi/kernels/gpu/layer_norm_kernel.cu
index 34425d8cfcfe2..c5bb0c288f260 100644
--- a/paddle/phi/kernels/gpu/layer_norm_kernel.cu
+++ b/paddle/phi/kernels/gpu/layer_norm_kernel.cu
@@ -578,7 +578,8 @@ void LayerNormKernel(const Context &dev_ctx,
                 VecSize,                                                   \
                 WARPS_M,                                                   \
                 WARPS_N,                                                   \
-                BYTES_PER_LDG>                                             \
+                BYTES_PER_LDG,                                             \
+                feature_size>                                              \
         <<<grid, THREADS_PER_CTA, 0, stream>>>(                            \
             batch_size,                                                    \
             feature_size,                                                  \
@@ -605,8 +606,7 @@ void LayerNormKernel(const Context &dev_ctx,
   if ((feature_size >= 768 && feature_size <= 2048 &&
        feature_size % 256 == 0 || feature_size == 4096) &&
       scale != nullptr && bias != nullptr) {
-    // can_call_fast_kernel = true;
-    can_call_fast_kernel = false;
+    can_call_fast_kernel = true;
   }
 
   if (can_call_fast_kernel) {
diff --git a/test/legacy_test/test_layer_norm_op.py b/test/legacy_test/test_layer_norm_op.py
index 32d23ad3e1c72..4af8c68346230 100644
--- a/test/legacy_test/test_layer_norm_op.py
+++ b/test/legacy_test/test_layer_norm_op.py
@@ -515,7 +515,13 @@ def setUp(self):
         self.use_cudnn = True
 
     def __assert_close(self, tensor, np_array, msg, atol=1e-4):
-        self.assertTrue(np.allclose(np.array(tensor), np_array, atol=atol), msg)
+        np.testing.assert_allclose(
+            np.array(tensor).flatten(),
+            np_array.flatten(),
+            rtol=1e-3,
+            atol=atol,
+            err_msg=msg,
+        )
 
     def check_forward_backward(
         self,

From 8db3ff1f8daa12f9cdde98a2d95a2134ea5b61d7 Mon Sep 17 00:00:00 2001
From: lishicheng1996 <43111799+lishicheng1996@users.noreply.github.com>
Date: Tue, 25 Jul 2023 16:15:06 +0800
Subject: [PATCH 34/34] fix a bug caused by hipcc lambda value capture (#55612)

---
 paddle/phi/kernels/funcs/blas/blas_impl.hip.h | 50 +++++++++++++++++++
 1 file changed, 50 insertions(+)

diff --git a/paddle/phi/kernels/funcs/blas/blas_impl.hip.h b/paddle/phi/kernels/funcs/blas/blas_impl.hip.h
index 6aa41e4f4a2b6..805a718ab85ed 100644
--- a/paddle/phi/kernels/funcs/blas/blas_impl.hip.h
+++ b/paddle/phi/kernels/funcs/blas/blas_impl.hip.h
@@ -1173,6 +1173,56 @@ void Blas<phi::GPUContext>::BatchedGEMM(CBLAS_TRANSPOSE transA,
   });
 }
 
+template <>
+template <>
+inline void Blas<phi::GPUContext>::BatchedGEMM(CBLAS_TRANSPOSE transA,
+                                               CBLAS_TRANSPOSE transB,
+                                               int M,
+                                               int N,
+                                               int K,
+                                               float16 alpha,
+                                               const float16 *A,
+                                               const float16 *B,
+                                               float16 beta,
+                                               float16 *C,
+                                               int batchCount,
+                                               int64_t strideA,
+                                               int64_t strideB) const {
+  // Note that cublas follows fortran order, so the order is different from
+  // the cblas convention.
+  int lda = (transA == CblasNoTrans) ? K : M;
+  int ldb = (transB == CblasNoTrans) ? N : K;
+  int ldc = N;
+  rocblas_operation cuTransA = (transA == CblasNoTrans)
+                                   ? rocblas_operation_none
+                                   : rocblas_operation_transpose;
+  rocblas_operation cuTransB = (transB == CblasNoTrans)
+                                   ? rocblas_operation_none
+                                   : rocblas_operation_transpose;
+  const int64_t strideC = M * N;
+  context_.CublasCall([&](rocblas_handle handle) {
+    PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::rocblas_hgemm_strided_batched(
+        handle,
+        cuTransB,
+        cuTransA,
+        N,
+        M,
+        K,
+        reinterpret_cast<const rocblas_half *>(&alpha),
+        reinterpret_cast<const rocblas_half *>(B),
+        ldb,
+        strideB,
+        reinterpret_cast<const rocblas_half *>(A),
+        lda,
+        strideA,
+        reinterpret_cast<const rocblas_half *>(&beta),
+        reinterpret_cast<rocblas_half *>(C),
+        ldc,
+        strideC,
+        batchCount));
+  });
+}
+
 // note(wangran16): unknown bug. parameters dislocation when calling
 // GEMM_STRIDED_BATCH and GEMM_STRIDED_BATCH
 template <>
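
Note on PATCH 33: the fix wraps every vectorized gamma/beta/x load and the final
store in a `col < cols` guard, zero-filling vectors whose column index would run
past the end of the row. Below is a minimal standalone C++ sketch of that
bounds-guarded vector-load pattern; `Vec`, `kVecSize`, and `LoadVec` are
hypothetical stand-ins for the `phi::` vector types and `phi::Load`, not the
Paddle API, and the guard arithmetic is simplified to element units.

// Standalone sketch of a bounds-guarded vector load (illustrative only).
#include <array>
#include <cstdio>
#include <vector>

constexpr int kVecSize = 4;
using Vec = std::array<float, kVecSize>;

// Stand-in for phi::Load: copies kVecSize contiguous floats.
Vec LoadVec(const float* ptr) {
  Vec v{};
  for (int i = 0; i < kVecSize; ++i) v[i] = ptr[i];
  return v;
}

int main() {
  const int cols = 8;                    // row width in elements
  std::vector<float> gamma(cols, 1.0f);  // exactly two full vectors
  for (int col = 0; col < 3; ++col) {    // a third load would overrun the row
    Vec g = (col + 1) * kVecSize <= cols
                ? LoadVec(gamma.data() + col * kVecSize)
                : Vec{};                 // zero-fill instead of reading OOB
    std::printf("col %d -> %.1f\n", col, g[0]);
  }
  return 0;
}

Zero-filling the skipped vectors keeps the later reduction arithmetic
well-defined, since the zeros contribute nothing to the row sums.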
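
Note on PATCH 34: the added float16 specialization passes a callback to
context_.CublasCall whose lambda captures every GEMM parameter by reference
with [&]; the commit title attributes the original miscompilation to hipcc's
handling of lambda value capture. The sketch below shows only that capture
pattern; FakeHandle and CallWithHandle are hypothetical stand-ins for
rocblas_handle and the Paddle context callback, not the real API.

// Sketch of by-reference capture in a handle callback (illustrative only).
#include <functional>
#include <iostream>

struct FakeHandle {};

// Runs the callback with a live handle, in the style of CublasCall.
void CallWithHandle(const std::function<void(FakeHandle)>& fn) {
  FakeHandle handle;
  fn(handle);
}

int main() {
  const int M = 2, N = 3, K = 4;
  const float alpha = 1.0f, beta = 0.0f;

  // [&] captures M, N, K, alpha, beta by reference: the closure stores no
  // copies, so a compiler bug in materializing by-value captures has
  // nothing to miscompile.
  CallWithHandle([&](FakeHandle h) {
    (void)h;  // a real callback would hand h to the BLAS routine
    std::cout << "gemm " << M << "x" << N << "x" << K << " alpha=" << alpha
              << " beta=" << beta << "\n";
  });
  return 0;
}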