From db921ae95ec264479a34a0026f9dba3cbc586733 Mon Sep 17 00:00:00 2001 From: sneaxiy <32832641+sneaxiy@users.noreply.github.com> Date: Sat, 22 Jul 2023 18:19:33 +0800 Subject: [PATCH 01/34] Fix launch error when PADDLE_TRAINER_ENDPOINTS is too long (#55478) * fix new launch * fix ps uit --- python/paddle/distributed/backup_env.py | 35 +++++++++++++++++++ .../distributed/fleet/base/role_maker.py | 8 +++-- .../distributed/fleet/elastic/manager.py | 4 ++- .../launch/controllers/controller.py | 3 ++ .../launch/utils/process_context.py | 26 +++++++++++++- python/paddle/distributed/parallel.py | 9 +++-- python/paddle/fluid/executor.py | 3 +- test/legacy_test/test_run.py | 1 + 8 files changed, 81 insertions(+), 8 deletions(-) create mode 100644 python/paddle/distributed/backup_env.py diff --git a/python/paddle/distributed/backup_env.py b/python/paddle/distributed/backup_env.py new file mode 100644 index 0000000000000..60428b9a2025d --- /dev/null +++ b/python/paddle/distributed/backup_env.py @@ -0,0 +1,35 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import json +import os + +g_backup_envs = None + + +def getenv_or_backup(name, default=None): + global g_backup_envs + if g_backup_envs is None: + backup_path = os.getenv('PADDLE_BACKUP_ENV_PATH') + if backup_path is None: + g_backup_envs = {} + else: + with open(backup_path, 'r') as f: + g_backup_envs = json.load(f) + + value = os.getenv(name) + if value is not None: + return value + else: + return g_backup_envs.get(name, default) diff --git a/python/paddle/distributed/fleet/base/role_maker.py b/python/paddle/distributed/fleet/base/role_maker.py index 3f2b22d8795c2..113a0132f4c12 100755 --- a/python/paddle/distributed/fleet/base/role_maker.py +++ b/python/paddle/distributed/fleet/base/role_maker.py @@ -25,6 +25,8 @@ ) from paddle.fluid import core +from ...backup_env import getenv_or_backup + __all__ = [] @@ -844,7 +846,9 @@ def _ps_env(self): # each role will execute it self._server_endpoints = self._server_endpoints.split(",") - self._worker_endpoints = os.getenv("PADDLE_TRAINER_ENDPOINTS", None) + self._worker_endpoints = getenv_or_backup( + "PADDLE_TRAINER_ENDPOINTS", None + ) if self._worker_endpoints is not None: self._worker_endpoints = self._worker_endpoints.split(",") else: @@ -1066,7 +1070,7 @@ def _collective_env(self): self._training_role = os.getenv("PADDLE_TRAINING_ROLE", "TRAINER") assert self._training_role == "TRAINER" self._role = Role.WORKER - self._worker_endpoints = os.getenv("PADDLE_TRAINER_ENDPOINTS") + self._worker_endpoints = getenv_or_backup("PADDLE_TRAINER_ENDPOINTS") self._cur_endpoint = os.getenv("PADDLE_CURRENT_ENDPOINT") if self._worker_endpoints is None: # back to non_distributed execution. 
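
The mechanism this patch introduces is small enough to sketch in isolation: the launcher withholds any oversized variable from the child process environment, dumps the complete environment to a JSON file, and exports that file's path as PADDLE_BACKUP_ENV_PATH; readers then call getenv_or_backup() instead of os.getenv(). The standalone sketch below condenses both halves. The endpoint list and file location are fabricated for the demo, and this version of the reader takes an explicit env mapping as an argument, whereas the real helper in backup_env.py consults os.environ.

    import json
    import os
    import tempfile

    # Launcher side (condensed from process_context.py): variables in
    # LIMIT_LEN_ENVS that exceed the length limit are dropped from the
    # child environment, but the full environment is backed up to JSON.
    LIMIT_LEN_ENVS = ["TRAINER_IP_PORT_LIST", "PADDLE_TRAINER_ENDPOINTS"]
    max_len = 48000  # default of PADDLE_ENV_LIMIT_LEN in the patch

    full_env = {
        # a deliberately oversized endpoint list, made up for the demo
        "PADDLE_TRAINER_ENDPOINTS": ",".join(
            f"10.0.0.{i % 250}:{6170 + i}" for i in range(8000)
        ),
        "PADDLE_TRAINER_ID": "0",
    }
    backup_path = os.path.join(tempfile.mkdtemp(), "backup_env.0.json")
    with open(backup_path, "w") as f:
        json.dump(full_env, f)

    child_env = {
        k: v
        for k, v in full_env.items()
        if k not in LIMIT_LEN_ENVS or len(v) < max_len
    }
    child_env["PADDLE_BACKUP_ENV_PATH"] = backup_path

    # Worker side (condensed from backup_env.py): fall back to the
    # backup file when the variable is absent from the environment.
    def getenv_or_backup(env, name, default=None):
        value = env.get(name)
        if value is not None:
            return value
        path = env.get("PADDLE_BACKUP_ENV_PATH")
        if path is None:
            return default
        with open(path) as f:
            return json.load(f).get(name, default)

    endpoints = getenv_or_backup(child_env, "PADDLE_TRAINER_ENDPOINTS")
    print(len(endpoints.split(",")))  # 8000, recovered via the backup file

The oversized variable never crosses the process environment table, which is what made the original launch fail, yet any consumer that goes through getenv_or_backup() still sees the full value.
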
diff --git a/python/paddle/distributed/fleet/elastic/manager.py b/python/paddle/distributed/fleet/elastic/manager.py index 5e0de5c3120e3..00151a8dee5f1 100644 --- a/python/paddle/distributed/fleet/elastic/manager.py +++ b/python/paddle/distributed/fleet/elastic/manager.py @@ -25,6 +25,8 @@ from paddle.distributed.fleet import cloud_utils, launch_utils from paddle.distributed.utils.log_utils import get_logger +from ...backup_env import getenv_or_backup + logger = get_logger("INFO", "ELASTIC") ELASTIC_EXIT_CODE = 101 @@ -149,7 +151,7 @@ def __init__(self, args, etcd_client): self.np = len(self.trainers.split(",")) self.start_port = int(os.getenv("PADDLE_PORT", "6170")) self.dist_endpoints = os.getenv('DISTRIBUTED_TRAINER_ENDPOINTS', '') - trainer_endpoints = os.getenv('PADDLE_TRAINER_ENDPOINTS', '') + trainer_endpoints = getenv_or_backup('PADDLE_TRAINER_ENDPOINTS', '') self.trainer_endpoints_list = trainer_endpoints.split(",") else: self.trainers = args.ips or os.getenv('PADDLE_TRAINERS', '') diff --git a/python/paddle/distributed/launch/controllers/controller.py b/python/paddle/distributed/launch/controllers/controller.py index 9769ec9d6bf3f..25becbba6f329 100644 --- a/python/paddle/distributed/launch/controllers/controller.py +++ b/python/paddle/distributed/launch/controllers/controller.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +import copy import os import signal import sys @@ -244,6 +245,8 @@ def add_container( is_init=False, ): if not container: + envs = copy.deepcopy(envs) + envs['PADDLE_LOG_DIR'] = str(os.path.abspath(self.ctx.args.log_dir)) container = self.new_container( entrypoint=entrypoint, envs=envs, out=log_file, err=log_file ) diff --git a/python/paddle/distributed/launch/utils/process_context.py b/python/paddle/distributed/launch/utils/process_context.py index 6543d7bd9ebae..8b14d5417a68b 100644 --- a/python/paddle/distributed/launch/utils/process_context.py +++ b/python/paddle/distributed/launch/utils/process_context.py @@ -12,12 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. +import json import os import signal import subprocess import sys import time +LIMIT_LEN_ENVS = ["TRAINER_IP_PORT_LIST", "PADDLE_TRAINER_ENDPOINTS"] + class ProcessContext: def __init__( @@ -42,9 +45,30 @@ def __init__( def _start(self): pre_fn = os.setsid if self._group else None + log_dir = self._env["PADDLE_LOG_DIR"] + os.makedirs(log_dir, exist_ok=True) + + rank = self._env.get("PADDLE_TRAINER_ID") + if rank is not None: + rank = int(rank) + backup_env_path = str( + os.path.join(log_dir, f'backup_env.{rank}.json') + ) + envs = {"PADDLE_BACKUP_ENV_PATH": backup_env_path} + + max_len = int(os.getenv('PADDLE_ENV_LIMIT_LEN', 48000)) + for k, v in self._env.items(): + if k not in LIMIT_LEN_ENVS or len(v) < max_len: + envs[k] = v + + with open(backup_env_path, 'w') as f: + json.dump(dict(self._env), f, indent=4, sort_keys=True) + else: + envs = self._env + self._proc = subprocess.Popen( self._cmd, - env=self._env, + env=envs, stdout=self._stdout, stderr=self._stderr, preexec_fn=self._preexec_fn or pre_fn, diff --git a/python/paddle/distributed/parallel.py b/python/paddle/distributed/parallel.py index cc6ab5384ca4e..a34807d2b7377 100644 --- a/python/paddle/distributed/parallel.py +++ b/python/paddle/distributed/parallel.py @@ -56,6 +56,7 @@ from paddle.utils import deprecated from . 
import parallel_helper +from .backup_env import getenv_or_backup __all__ = [] @@ -704,7 +705,7 @@ def __init__(self): selected_xpus = os.getenv("FLAGS_selected_xpus", "0").split(",") self._device_id = int(selected_xpus[0]) - self._trainer_endpoints = os.getenv( + self._trainer_endpoints = getenv_or_backup( "PADDLE_TRAINER_ENDPOINTS", "" ).split(",") self._current_endpoint = os.getenv("PADDLE_CURRENT_ENDPOINT", "") @@ -878,7 +879,7 @@ def _is_cpuonly(backend): def _check_var_exists(var_name): - var = os.environ.get(var_name, None) + var = getenv_or_backup(var_name, None) if var is None: raise ValueError( "paddle.distributed initialize error, " @@ -1060,7 +1061,9 @@ def train(): if endpoints is None: endpoints = os.getenv("PADDLE_MASTER", None) if endpoints is None: - endpoints = os.getenv("PADDLE_TRAINER_ENDPOINTS").split(',')[0] + endpoints = getenv_or_backup("PADDLE_TRAINER_ENDPOINTS").split(',')[ + 0 + ] assert endpoints, ( "The environment variable 'MASTER_ADDR' and 'MASTER_PORT' " "must be specified, for example 'export MASTER_ADDR=127.0.0.1' " diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py index b3deb787960e6..a9afe7f5c8d0d 100755 --- a/python/paddle/fluid/executor.py +++ b/python/paddle/fluid/executor.py @@ -525,8 +525,9 @@ def _to_str(var): def _prepare_fleet_executor(): from ..distributed.fleet.proto import fleet_executor_desc_pb2 + from ..distributed.backup_env import getenv_or_backup - trainer_endpoints_str = os.getenv("PADDLE_TRAINER_ENDPOINTS", "") + trainer_endpoints_str = getenv_or_backup("PADDLE_TRAINER_ENDPOINTS", "") trainer_endpoints = trainer_endpoints_str.split(',') fleet_exe_desc = fleet_executor_desc_pb2.FleetExecutorDesc() cur_rank = int(os.getenv("PADDLE_TRAINER_ID", 0)) diff --git a/test/legacy_test/test_run.py b/test/legacy_test/test_run.py index 467b9ef35c67b..3174dd7005ce6 100644 --- a/test/legacy_test/test_run.py +++ b/test/legacy_test/test_run.py @@ -55,6 +55,7 @@ def get_files(pth, prefix): if isfile(join(pth, f)) and not f.endswith('gpu.log') and not f.startswith('envlog') + and not f.startswith('backup_env') ] From 6da9db501aaabd166c6591c5a7608df746aba226 Mon Sep 17 00:00:00 2001 From: liuzhenhai93 Date: Sat, 22 Jul 2023 22:16:57 +0800 Subject: [PATCH 02/34] fix group_shard3_get_all_parameter (#55572) --- .../sharding/group_sharded_stage3.py | 45 +++++++++++-------- 1 file changed, 26 insertions(+), 19 deletions(-) diff --git a/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_stage3.py b/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_stage3.py index f6b86ce736d78..b9ca53aeef0a1 100644 --- a/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_stage3.py +++ b/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_stage3.py @@ -651,6 +651,11 @@ def get_all_parameters(self, convert2cpu=False): for param in trainable_params: t_flow.full_param[param.name][0]._share_buffer_to(param) + # a _allgather_buffer call should be matched with a _release_param call later, + # but the _allgather_buffer call here has no match. 
+ # TODO(liuzhenhai): set a flag here and release full param before forward pass of the first layer, + # when _allgather_buffer is called for get_all_parameters and convert2cpu is false + self._optim._parameter_list = self._ori_parameter_list self._optim._param_groups = self._ori_param_groups @@ -924,14 +929,11 @@ class TaskFlow: def __init__( self, - full_param={}, - full_grad={}, - use_calc={}, callback=None, ): - self.full_param = full_param - self.full_grad = full_grad - self.use_calc = use_calc + self.full_param = {} + self.full_grad = {} + self.use_calc = {} self.callback = callback @@ -1004,6 +1006,9 @@ def _allgather_buffer( offload=False, convert2cpu=False, ): + if convert2cpu: + assert sync_wait + for param in trainable_params: if param.status == "all": param.use_count += 1 @@ -1020,20 +1025,22 @@ def _allgather_buffer( if sync_wait: with paddle.amp.auto_cast(enable=False): task.wait() - full_param._slice(0, param._numel())._share_buffer_to(param) - param.fw_storage._clear() - param.fw_storage = None - param.status = "all" - param.use_count += 1 + if convert2cpu: + # status is not changed + cpu_full_param = _device2cpu( + full_param._slice(0, param._numel()) + ) + full_param._clear_data() + del full_param + full_param = cpu_full_param + task = None + else: + full_param._slice(0, param._numel())._share_buffer_to(param) + param.fw_storage._clear() + param.fw_storage = None + param.status = "all" + param.use_count += 1 task_flow.full_param[param.name] = (full_param, task) - - # parameter converts to cpu - if convert2cpu: - p_name = param.name - param = _device2cpu(param) - del task_flow.full_param[p_name] - task_flow.full_param[p_name] = (param, None) - return task_flow From 2b8e62859b8f83d646c6a53c3572f88440cdc0bc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E6=98=A5=E4=B9=94?= <83450930+Liyulingyue@users.noreply.github.com> Date: Mon, 24 Jul 2023 08:26:53 +0800 Subject: [PATCH 03/34] =?UTF-8?q?[IR=20Dialect]=20=E2=9A=94Elden=20chapter?= =?UTF-8?q?=201.1=E2=9A=94=20=20(#55525)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * IntArrayAttributeStorage --- paddle/fluid/ir/dialect/pd_attribute.cc | 4 +++- paddle/fluid/ir/dialect/pd_attribute.h | 2 +- paddle/fluid/ir/dialect/pd_attribute_storage.h | 2 +- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/ir/dialect/pd_attribute.cc b/paddle/fluid/ir/dialect/pd_attribute.cc index 78a71013b87b5..687e836dc70f3 100644 --- a/paddle/fluid/ir/dialect/pd_attribute.cc +++ b/paddle/fluid/ir/dialect/pd_attribute.cc @@ -16,7 +16,9 @@ namespace paddle { namespace dialect { -phi::IntArray IntArrayAttribute::data() const { return storage()->GetAsKey(); } +const phi::IntArray& IntArrayAttribute::data() const { + return storage()->GetAsKey(); +} phi::DataType DataTypeAttribute::data() const { return storage()->GetAsKey(); } diff --git a/paddle/fluid/ir/dialect/pd_attribute.h b/paddle/fluid/ir/dialect/pd_attribute.h index 7d921ebd1df1b..5af73b2c0f48b 100644 --- a/paddle/fluid/ir/dialect/pd_attribute.h +++ b/paddle/fluid/ir/dialect/pd_attribute.h @@ -33,7 +33,7 @@ class IntArrayAttribute : public ir::Attribute { return storage() < right.storage(); } - phi::IntArray data() const; + const phi::IntArray &data() const; }; class ScalarAttribute : public ir::Attribute { diff --git a/paddle/fluid/ir/dialect/pd_attribute_storage.h b/paddle/fluid/ir/dialect/pd_attribute_storage.h index 78bf9beb089d7..1877e5043fc65 100644 --- a/paddle/fluid/ir/dialect/pd_attribute_storage.h +++ 
b/paddle/fluid/ir/dialect/pd_attribute_storage.h @@ -48,7 +48,7 @@ struct IntArrayAttributeStorage : public ir::AttributeStorage { (data_.FromTensor() == key.FromTensor()); } - ParamKey GetAsKey() const { return ParamKey(data_); } + const ParamKey &GetAsKey() const { return data_; } private: phi::IntArray data_; From bd60757d03f062583b95c966e84036985c830f3b Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Mon, 24 Jul 2023 10:35:46 +0800 Subject: [PATCH 04/34] [AutoParallel] Add shard tensor and DistAttr api (#55494) * add shard tensor api * add DistAttr api * add unittest for coverage * fix process mesh sample code * fix checking error --- python/paddle/distributed/__init__.py | 8 +- .../paddle/distributed/auto_parallel/api.py | 126 ++++++++++++++++++ .../distributed/auto_parallel/process_mesh.py | 10 +- test/auto_parallel/CMakeLists.txt | 1 + test/auto_parallel/test_dist_tensor.py | 8 +- test/auto_parallel/test_shard_tensor_api.py | 78 +++++++++++ 6 files changed, 226 insertions(+), 5 deletions(-) create mode 100644 python/paddle/distributed/auto_parallel/api.py create mode 100644 test/auto_parallel/test_shard_tensor_api.py diff --git a/python/paddle/distributed/__init__.py b/python/paddle/distributed/__init__.py index 8f6237bfa4c4b..183f307607c36 100644 --- a/python/paddle/distributed/__init__.py +++ b/python/paddle/distributed/__init__.py @@ -61,8 +61,11 @@ get_backend, ) # noqa: F401 +from .auto_parallel.process_mesh import ProcessMesh # noqa: F401 +from .auto_parallel.api import DistAttr # noqa: F401 + from .auto_parallel import shard_op # noqa: F401 -from .auto_parallel import shard_tensor # noqa: F401 +from .auto_parallel.api import shard_tensor # noqa: F401 from .fleet import BoxPSDataset # noqa: F401 @@ -120,4 +123,7 @@ "reduce_scatter", "is_available", "get_backend", + "ProcessMesh", + "DistAttr", + "shard_tensor", ] diff --git a/python/paddle/distributed/auto_parallel/api.py b/python/paddle/distributed/auto_parallel/api.py new file mode 100644 index 0000000000000..b25799d058ad2 --- /dev/null +++ b/python/paddle/distributed/auto_parallel/api.py @@ -0,0 +1,126 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +from paddle.distributed.auto_parallel.process_mesh import ProcessMesh +from paddle.framework import core + +# There are the auto parallel API of the unified version of dynamic and static mode. +# Some APIs have the same name with the previous APIs implementation, which are +# a temporary state, and the APIs here will eventually be used. + + +class DistAttr(core.TensorDistAttr): + """ + DistAttr specifies how tensors are distributed or sliced on ProcessMesh. + + Args: + mesh(paddle.distributed.ProcessMesh): The `ProcessMesh` object describes the Cartesian topology of the used processes. + sharding_specs(list[str|None]): The specification describing how to shard the Tensor. + + Examples: + + .. 
code-block:: python + + import paddle + import paddle.distributed as dist + + mesh = dist.ProcessMesh([[2, 4, 5], [0, 1, 3]], dim_names=["x", "y"]) + dist_attr = dist.DistAttr(mesh=mesh, sharding_specs=['x', 'y']) + + print(dist_attr) + """ + + def __init__(self, mesh, sharding_specs): + # 1. inputs checking + if not isinstance(mesh, ProcessMesh): + raise ValueError( + "The mesh must be an instance of paddle.distributed.ProcessMesh." + ) + if not isinstance(sharding_specs, list): + raise ValueError("The sharding_specs must be an instance of list.") + assert all( + isinstance(dim_name, str) or dim_name is None + for dim_name in sharding_specs + ), 'The dimension name in sharding_specs must be an instance of str.' + + dims_mapping = [ + mesh.dim_names.index(dim_name) if dim_name is not None else -1 + for dim_name in sharding_specs + ] + + # 2. init core.TensorDistAttr + core.TensorDistAttr.__init__(self) + self.process_mesh = mesh + self.dims_mapping = dims_mapping + + +def shard_tensor( + data, dtype=None, place=None, stop_gradient=True, dist_attr=None +): + """ + Constructs a ``paddle.Tensor`` with distributed attributes from ``data``, + which can scalar, tuple, list, numpy.ndarray, paddle.Tensor. + + If the ``data`` is already a Tensor, transform it to a Distributed Tensor. + + Args: + data(scalar|tuple|list|ndarray|Tensor): Initial data for the tensor. + Can be a scalar, list, tuple, numpy.ndarray, paddle.Tensor. + dtype(str|np.dtype, optional): The desired data type of returned tensor. Can be 'bool' , 'float16' , + 'float32' , 'float64' , 'int8' , 'int16' , 'int32' , 'int64' , 'uint8', + 'complex64' , 'complex128'. Default: None, infers dtype from ``data`` + except for python float number which gets dtype from ``get_default_type`` . + place(CPUPlace|CUDAPinnedPlace|CUDAPlace|str, optional): The place to allocate Tensor. Can be + CPUPlace, CUDAPinnedPlace, CUDAPlace. Default: None, means global place. If ``place`` is + string, It can be ``cpu``, ``gpu:x`` and ``gpu_pinned``, where ``x`` is the index of the GPUs. + stop_gradient(bool, optional): Whether to block the gradient propagation of Autograd. Default: True. + dist_attr(paddle.distributed.DistAttr): Specify how tensors are distributed or sliced on ProcessMesh. + + Returns: + Tensor: A Tensor constructed from ``data`` with distributed attributes. + + Examples: + + .. code-block:: python + + import paddle + import paddle.distributed as dist + + mesh = dist.ProcessMesh([[2, 4, 5], [0, 1, 3]], dim_names=["x", "y"]) + dist_attr = dist.DistAttr(mesh=mesh, sharding_specs=['x', 'y']) + + # dense tensor + a = paddle.to_tensor([[1,2,3], + [5,6,7]]) + # distributed tensor + d_tensor = dist.shard_tensor(a, dist_attr=dist_attr) + + print(d_tensor) + """ + # 1. create dense tensor + # `paddle.to_tensor` supports both dynamic and static mode + data = paddle.to_tensor(data) + + # 2. create dist tensor + assert len(dist_attr.dims_mapping) == len( + list(data.shape) + ), "The length of sharding_specs must be same as the shape of the input tensor." + + if paddle.in_dynamic_mode(): + return paddle.Tensor(data, dist_attr=dist_attr) + else: + raise NotImplementedError( + "The `paddle.distributed.shard_tensor` for static mode will be implemented later." 
+ ) diff --git a/python/paddle/distributed/auto_parallel/process_mesh.py b/python/paddle/distributed/auto_parallel/process_mesh.py index 1c2f292e5f861..a6ad3355d7d2c 100644 --- a/python/paddle/distributed/auto_parallel/process_mesh.py +++ b/python/paddle/distributed/auto_parallel/process_mesh.py @@ -82,8 +82,9 @@ class ProcessMesh(core.ProcessMesh): .. code-block:: python import paddle + import paddle.distributed as dist - mesh = auto.ProcessMesh([[2, 4, 5], [0, 1, 3]], dim_names=["x", "y"]) + mesh = dist.ProcessMesh([[2, 4, 5], [0, 1, 3]], dim_names=["x", "y"]) assert mesh.shape == [2, 3] assert mesh.process_ids == [2, 4, 5, 0, 1, 3] @@ -162,6 +163,13 @@ def mesh(self): """ return self._mesh + @property + def dim_names(self): + """ + Get the underlying dimension names of ProcessMesh. + """ + return self._dim_names + @property def unique_id(self): """ diff --git a/test/auto_parallel/CMakeLists.txt b/test/auto_parallel/CMakeLists.txt index fe67e34c03d39..83c2ae84182fb 100644 --- a/test/auto_parallel/CMakeLists.txt +++ b/test/auto_parallel/CMakeLists.txt @@ -153,6 +153,7 @@ if(WITH_DISTRIBUTE AND WITH_GPU) py_test_modules(test_engine_save_load MODULES test_engine_save_load) py_test_modules(test_rule_based_tuner MODULES test_rule_based_tuner) py_test_modules(test_dist_tensor MODULES test_dist_tensor) + py_test_modules(test_shard_tensor_api MODULES test_shard_tensor_api) # End of unittests WITH single card WITHOUT timeout endif() diff --git a/test/auto_parallel/test_dist_tensor.py b/test/auto_parallel/test_dist_tensor.py index 58ebc0850046b..61705a322e2d6 100644 --- a/test/auto_parallel/test_dist_tensor.py +++ b/test/auto_parallel/test_dist_tensor.py @@ -17,20 +17,22 @@ import numpy as np import paddle +import paddle.distributed as dist class TestDistTensor(unittest.TestCase): def test_dist_tensor_creation(self): shape = [10, 5] - dist_attr = paddle.fluid.core.TensorDistAttr() + mesh = dist.ProcessMesh([[0, 1], [2, 3]], dim_names=["x", "y"]) + dist_attr = dist.DistAttr(mesh=mesh, sharding_specs=['x', 'y']) # create dist tensor using numpy - dist_tensor_with_numpy = paddle.Tensor( + dist_tensor_with_numpy = dist.shard_tensor( np.ones(shape, dtype=np.float32), dist_attr=dist_attr ) # create dist tensor using tensor - dist_tensor_with_tensor = paddle.Tensor( + dist_tensor_with_tensor = dist.shard_tensor( paddle.ones(shape), dist_attr=dist_attr ) diff --git a/test/auto_parallel/test_shard_tensor_api.py b/test/auto_parallel/test_shard_tensor_api.py new file mode 100644 index 0000000000000..124c7dc7ba39e --- /dev/null +++ b/test/auto_parallel/test_shard_tensor_api.py @@ -0,0 +1,78 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest + +import paddle +import paddle.distributed as dist + + +class TestDistAttrBasic(unittest.TestCase): + def test_mesh_argument_error(self): + exception = None + try: + mesh = [[0, 1], [2, 3]] + dist_attr = dist.DistAttr(mesh=mesh, sharding_specs=['x', 'y']) + except ValueError as ex: + self.assertIn( + "The mesh must be an instance of paddle.distributed.ProcessMesh", + str(ex), + ) + exception = ex + + self.assertIsNotNone(exception) + + def test_sharding_specs_argument_error(self): + exception = None + try: + mesh = dist.ProcessMesh( + [[2, 4, 5], [0, 1, 3]], dim_names=["x", "y"] + ) + dist_attr = dist.DistAttr( + mesh=mesh, sharding_specs={"x": 0, "y": 1} + ) + except ValueError as ex: + self.assertIn( + "The sharding_specs must be an instance of list", str(ex) + ) + exception = ex + + self.assertIsNotNone(exception) + + +class TestShardTensorBasic(unittest.TestCase): + # remove this test after static mode is supported + def test_static_mode_unimplemented(self): + exception = None + try: + paddle.enable_static() + mesh = dist.ProcessMesh( + [[2, 4, 5], [0, 1, 3]], dim_names=["x", "y"] + ) + dist_attr = dist.DistAttr(mesh=mesh, sharding_specs=['x', 'y']) + a = paddle.to_tensor([[1, 2, 3], [5, 6, 7]]) + d_tensor = dist.shard_tensor(a, dist_attr=dist_attr) + except NotImplementedError as ex: + self.assertIn( + "The `paddle.distributed.shard_tensor` for static mode will be implemented later", + str(ex), + ) + exception = ex + paddle.disable_static() + + self.assertIsNotNone(exception) + + +if __name__ == "__main__": + unittest.main() From 1f3e6ec49ed4eacef4cf397491a9dc4a32f7a0ae Mon Sep 17 00:00:00 2001 From: xingmingyyj <135400902+xingmingyyj@users.noreply.github.com> Date: Mon, 24 Jul 2023 10:37:14 +0800 Subject: [PATCH 05/34] Order print attribute map (#55518) * fix_ir_printer * Update ir_printer.cc * Update ir_printer.cc * Update ir_printer.cc * Update ir_printer.cc * Update ir_printer.cc * Update paddle/ir/core/ir_printer.cc Co-authored-by: kangguangli * Update ir_printer.cc --------- Co-authored-by: kangguangli --- paddle/ir/core/ir_printer.cc | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/paddle/ir/core/ir_printer.cc b/paddle/ir/core/ir_printer.cc index a322e8fca9ffd..8890364565518 100644 --- a/paddle/ir/core/ir_printer.cc +++ b/paddle/ir/core/ir_printer.cc @@ -215,11 +215,14 @@ void IrPrinter::PrintOpResult(const Operation* op) { } void IrPrinter::PrintAttributeMap(const Operation* op) { + AttributeMap attributes = op->attributes(); + std::map> order_attributes( + attributes.begin(), attributes.end()); os << " {"; PrintInterleave( - op->attributes().begin(), - op->attributes().end(), + order_attributes.begin(), + order_attributes.end(), [this](std::pair it) { this->os << it.first; this->os << ":"; From cf76e7ae7cf6d26fc340ff0d3677870182688cd1 Mon Sep 17 00:00:00 2001 From: Yichen Zhang <32740647+pkuzyc@users.noreply.github.com> Date: Mon, 24 Jul 2023 10:46:37 +0800 Subject: [PATCH 06/34] [Semi-Auto] add split spmd rule (#55397) * add split spmd rule * add pytest in cmake file * small fix --- .../auto_parallel/spmd_rules/common.cc | 4 +- .../auto_parallel/spmd_rules/rules.h | 5 + .../spmd_rules/split_spmd_rule.cc | 126 +++++++++++ .../spmd_rules/split_spmd_rule.h | 40 ++++ test/auto_parallel/spmd_rules/CMakeLists.txt | 1 + .../spmd_rules/test_split_rule.py | 205 ++++++++++++++++++ 6 files changed, 379 insertions(+), 2 deletions(-) create mode 100644 paddle/fluid/distributed/auto_parallel/spmd_rules/split_spmd_rule.cc create mode 100644 
paddle/fluid/distributed/auto_parallel/spmd_rules/split_spmd_rule.h create mode 100644 test/auto_parallel/spmd_rules/test_split_rule.py diff --git a/paddle/fluid/distributed/auto_parallel/spmd_rules/common.cc b/paddle/fluid/distributed/auto_parallel/spmd_rules/common.cc index 47c0d9a683fc5..a0f46e1c46299 100644 --- a/paddle/fluid/distributed/auto_parallel/spmd_rules/common.cc +++ b/paddle/fluid/distributed/auto_parallel/spmd_rules/common.cc @@ -182,8 +182,8 @@ TensorDistAttr ReplicatedOnMesh(const TensorDistAttr& src_dist_attr) { void VerifySpecs(const std::vector& specs, const std::string& op_name) { for (size_t i = 0, n = specs.size(); i < n; ++i) { - std::vector shape = specs[i].shape(); - std::vector dims_mapping = specs[i].dims_mapping(); + const std::vector& shape = specs[i].shape(); + const std::vector& dims_mapping = specs[i].dims_mapping(); PADDLE_ENFORCE_EQ(shape.size(), dims_mapping.size(), phi::errors::InvalidArgument( diff --git a/paddle/fluid/distributed/auto_parallel/spmd_rules/rules.h b/paddle/fluid/distributed/auto_parallel/spmd_rules/rules.h index c58333d0fb701..bba4339198021 100644 --- a/paddle/fluid/distributed/auto_parallel/spmd_rules/rules.h +++ b/paddle/fluid/distributed/auto_parallel/spmd_rules/rules.h @@ -23,6 +23,7 @@ #include "paddle/fluid/distributed/auto_parallel/spmd_rules/reduction_spmd_rule.h" #include "paddle/fluid/distributed/auto_parallel/spmd_rules/replicated_spmd_rule.h" #include "paddle/fluid/distributed/auto_parallel/spmd_rules/softmax_spmd_rule.h" +#include "paddle/fluid/distributed/auto_parallel/spmd_rules/split_spmd_rule.h" // TODO(ljz) Automatic this process in cmake file. namespace paddle { @@ -150,6 +151,10 @@ REGISTER_SPMD_RULE(log_softmax, SoftmaxSPMDRule); REGISTER_SPMD_RULE(cross_entropy_with_softmax, CrossEntropyWithSoftmaxSPMDRule); REGISTER_SPMD_RULE(softmax_with_cross_entropy, CrossEntropyWithSoftmaxSPMDRule); +// split rule +REGISTER_SPMD_RULE(split, SplitSPMDRule); +REGISTER_SPMD_RULE(split_with_num, SplitSPMDRule); + } // namespace auto_parallel } // namespace distributed } // namespace paddle diff --git a/paddle/fluid/distributed/auto_parallel/spmd_rules/split_spmd_rule.cc b/paddle/fluid/distributed/auto_parallel/spmd_rules/split_spmd_rule.cc new file mode 100644 index 0000000000000..59c962dab8982 --- /dev/null +++ b/paddle/fluid/distributed/auto_parallel/spmd_rules/split_spmd_rule.cc @@ -0,0 +1,126 @@ +/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/distributed/auto_parallel/spmd_rules/split_spmd_rule.h" +#include +#include +#include "paddle/phi/core/distributed/auto_parallel/utils.h" + +namespace paddle { +namespace distributed { +namespace auto_parallel { + +using phi::distributed::auto_parallel::str_join; + +std::pair, std::vector> +SplitSPMDRule::InferForward(const std::vector& input_specs, + const paddle::framework::AttributeMap& attrs) { + // step0: Verify Input Args Based on Elementwise Logic + int64_t ninputs = input_specs.size(); + PADDLE_ENFORCE_EQ( + ninputs, + 1, + phi::errors::InvalidArgument("The size of InputSpec in split must " + "be equal to 1, but got [%d].", + ninputs)); + VerifySpecs(input_specs, "split"); + + // step1: Build Einsum Notation + int64_t ndim = input_specs[0].shape().size(); + int64_t noutput = 0; + // split api uses num or sections as attribute + if (attrs.find("num") != attrs.end()) { + noutput = ExtractAttr("num", attrs); + } else if (attrs.find("sections") != attrs.end()) { + std::vector sections = + ExtractAttr>("sections", attrs); + noutput = sections.size(); + } + int64_t axis = ExtractAttr("axis", attrs); + if (axis < 0) { + axis += ndim; + } + std::string alphabet = "abcdefghijlmnopqrstuvwxyz"; + + // get einsum notation for input, use a special + // notation 'k' to mark the splitted axis in input + std::vector input_axes_vec; + std::string input_axes = alphabet.substr(0, ndim); + input_axes[axis] = 'k'; + input_axes_vec.emplace_back(input_axes); + + // get einsum notation for output + std::string output_axes(input_axes); + // the splitted axis cannot be sharded, set its notation + // with the special '1' to set its dim mapping to -1. + output_axes[axis] = '1'; + + // step2: Sharding Propogation + // step2.1: merge input shardings + std::vector>> axes_sharding_info; + axes_sharding_info = GetAxesDimsMappingPair(input_axes_vec, input_specs); + std::unordered_map axis_to_dim_map = + ShardingMergeForTensors(axes_sharding_info); + + // step2.2: infer output dimsmapping from merged input dimsmapping + std::vector output_dims_mapping = + GetDimsMappingForAxes(output_axes, axis_to_dim_map); + + // get the dist attributes for all outputs, the + // dist attributes are same for all outputs. + std::vector output_dist_attrs; + for (int64_t i = 0; i < noutput; i++) { + output_dist_attrs.emplace_back( + CopyTensorDistAttrForOutput(input_specs[0].dist_attr())); + output_dist_attrs[i].set_dims_mapping(output_dims_mapping); + } + + // step2.3 get new dist attribute for input. the splitted + // cannot be sharded, if it is sharded, set it to replicated. 
+ std::vector new_input_dist_attrs; + new_input_dist_attrs.emplace_back(input_specs[0].dist_attr()); + std::vector new_input_dims_mapping(input_specs[0].dims_mapping()); + new_input_dims_mapping[axis] = -1; + new_input_dist_attrs[0].set_dims_mapping(new_input_dims_mapping); + + // Step2.4 handle input tensor partial (TODO) + VLOG(4) << "SplitSPMDRule InferForward: "; + for (int64_t i = 0; i < ninputs; i++) { + VLOG(4) << "Input" << std::to_string(i) << " shape: [" + << str_join(input_specs[i].shape()) << "] " + << "einsum_notation: " << input_axes << " src_dims_mapping: [" + << str_join(input_specs[i].dims_mapping()) << "] " + << "dst_dims_mapping: [" + << str_join(new_input_dist_attrs[i].dims_mapping()) << "]"; + } + for (int64_t i = 0; i < noutput; i++) { + VLOG(4) << "Output" << std::to_string(i) << " dims_mapping: [" + << str_join(output_dims_mapping) << "]"; + } + + return {new_input_dist_attrs, output_dist_attrs}; +} + +std::pair, std::vector> +SplitSPMDRule::InferBackward(const std::vector& output_specs, + const paddle::framework::AttributeMap& attrs) { + PADDLE_THROW(phi::errors::Unimplemented( + "InferBackward of SplitPMDRule is NOT implemented yet.")); + + return {}; +} + +} // namespace auto_parallel +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/auto_parallel/spmd_rules/split_spmd_rule.h b/paddle/fluid/distributed/auto_parallel/spmd_rules/split_spmd_rule.h new file mode 100644 index 0000000000000..f974e4cccce05 --- /dev/null +++ b/paddle/fluid/distributed/auto_parallel/spmd_rules/split_spmd_rule.h @@ -0,0 +1,40 @@ +/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include +#include +#include +#include + +#include "paddle/fluid/distributed/auto_parallel/spmd_rules/common.h" + +namespace paddle { +namespace distributed { +namespace auto_parallel { + +class SplitSPMDRule : public SPMDRuleBase { + public: + std::pair, std::vector> + InferForward(const std::vector& input_specs, + const paddle::framework::AttributeMap& attrs) override; + + std::pair, std::vector> + InferBackward(const std::vector& output_specs, + const paddle::framework::AttributeMap& attrs) override; +}; +} // namespace auto_parallel +} // namespace distributed +} // namespace paddle diff --git a/test/auto_parallel/spmd_rules/CMakeLists.txt b/test/auto_parallel/spmd_rules/CMakeLists.txt index ed1cf37389eee..1da9d4674c381 100644 --- a/test/auto_parallel/spmd_rules/CMakeLists.txt +++ b/test/auto_parallel/spmd_rules/CMakeLists.txt @@ -8,6 +8,7 @@ if(WITH_DISTRIBUTE AND WITH_GPU) py_test_modules(test_matmul_rule MODULES test_embedding_rule) py_test_modules(test_matmul_rule MODULES test_replicated_rule) py_test_modules(test_matmul_rule MODULES test_softmax_rule) + py_test_modules(test_split_rule MODULES test_split_rule) # End of unittests WITH single card WITHOUT timeout endif() diff --git a/test/auto_parallel/spmd_rules/test_split_rule.py b/test/auto_parallel/spmd_rules/test_split_rule.py new file mode 100644 index 0000000000000..1cd32d1bcf2b3 --- /dev/null +++ b/test/auto_parallel/spmd_rules/test_split_rule.py @@ -0,0 +1,205 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +from paddle.distributed.auto_parallel.static.completion import get_spmd_rule +from paddle.distributed.auto_parallel.static.dist_attribute import ( + DistTensorSpec, + TensorDistAttr, +) +from paddle.distributed.fleet import auto + + +class TestReductionSPMDRule(unittest.TestCase): + """ + Unit tests for split spmd rule. 
+ """ + + def setUp(self): + self.rule = get_spmd_rule("split") + + x_shape = [64, 32, 48] + process_mesh = auto.ProcessMesh(mesh=[0, 1, 2, 3]) + + x_tensor_dist_attr = TensorDistAttr() + x_tensor_dist_attr.dims_mapping = [1, 0] + x_tensor_dist_attr.process_mesh = process_mesh + self.x_dist_tensor_spec = DistTensorSpec(x_shape, x_tensor_dist_attr) + + self.attrs = { + 'num_or_sections': 2, + 'axis': 1, + } + + def test_single_mesh_dim(self): + # num_or_sections = 2, axis = 1 + # [0, -1, -1] --> [0, -1, -1], [0, -1, -1], [0, -1, -1] + self.rule = get_spmd_rule("split_with_num") + self.attrs = {} + self.attrs['num'] = 2 + self.attrs['axis'] = 1 + self.x_dist_tensor_spec.set_dims_mapping([0, -1, -1]) + result_dist_attrs = self.rule.infer_forward( + [self.x_dist_tensor_spec], self.attrs + ) + infered_input_dist_attrs = result_dist_attrs[0] + infered_output_dist_attrs = result_dist_attrs[1] + + self.assertEqual(len(result_dist_attrs), 2) + self.assertEqual(len(infered_input_dist_attrs), 1) + self.assertEqual(len(infered_output_dist_attrs), 2) + + self.assertEqual(infered_input_dist_attrs[0].dims_mapping, [0, -1, -1]) + self.assertEqual(infered_output_dist_attrs[0].dims_mapping, [0, -1, -1]) + self.assertEqual(infered_output_dist_attrs[1].dims_mapping, [0, -1, -1]) + + # num_or_sections = [15, 16, 17], axis = 2 + # [0, -1, -1] --> [0, -1, -1], [0, -1, -1], [0, -1, -1], [0, -1, -1] + self.rule = get_spmd_rule("split") + self.attrs = {} + self.attrs['sections'] = [15, 16, 17] + self.attrs['axis'] = 2 + self.x_dist_tensor_spec.set_dims_mapping([0, -1, -1]) + result_dist_attrs = self.rule.infer_forward( + [self.x_dist_tensor_spec], self.attrs + ) + infered_input_dist_attrs = result_dist_attrs[0] + infered_output_dist_attrs = result_dist_attrs[1] + + self.assertEqual(len(result_dist_attrs), 2) + self.assertEqual(len(infered_input_dist_attrs), 1) + self.assertEqual(len(infered_output_dist_attrs), 3) + + self.assertEqual(infered_input_dist_attrs[0].dims_mapping, [0, -1, -1]) + self.assertEqual(infered_output_dist_attrs[0].dims_mapping, [0, -1, -1]) + self.assertEqual(infered_output_dist_attrs[1].dims_mapping, [0, -1, -1]) + self.assertEqual(infered_output_dist_attrs[2].dims_mapping, [0, -1, -1]) + + # num_or_sections = [15, 16, 17], axis = 2 + # [-1, -1, 0] --> [-1, -1, -1], [-1, -1, -1], [-1 -1, -1], [-1, -1, -1] + self.attrs = {} + self.attrs['sections'] = [15, 16, 17] + self.attrs['axis'] = 2 + self.x_dist_tensor_spec.set_dims_mapping([-1, -1, 0]) + result_dist_attrs = self.rule.infer_forward( + [self.x_dist_tensor_spec], self.attrs + ) + infered_input_dist_attrs = result_dist_attrs[0] + infered_output_dist_attrs = result_dist_attrs[1] + + self.assertEqual(len(result_dist_attrs), 2) + self.assertEqual(len(infered_input_dist_attrs), 1) + self.assertEqual(len(infered_output_dist_attrs), 3) + + self.assertEqual(infered_input_dist_attrs[0].dims_mapping, [-1, -1, -1]) + self.assertEqual( + infered_output_dist_attrs[0].dims_mapping, [-1, -1, -1] + ) + self.assertEqual( + infered_output_dist_attrs[1].dims_mapping, [-1, -1, -1] + ) + self.assertEqual( + infered_output_dist_attrs[2].dims_mapping, [-1, -1, -1] + ) + + # num_or_sections = 2, axis = -2 + # [0, -1, -1] --> [0, -1, -1], [0, -1, -1], [0, -1, -1] + self.rule = get_spmd_rule("split_with_num") + self.attrs = {} + self.attrs['num'] = 2 + self.attrs['axis'] = -2 + self.x_dist_tensor_spec.set_dims_mapping([0, -1, -1]) + result_dist_attrs = self.rule.infer_forward( + [self.x_dist_tensor_spec], self.attrs + ) + infered_input_dist_attrs = 
result_dist_attrs[0] + infered_output_dist_attrs = result_dist_attrs[1] + + self.assertEqual(len(result_dist_attrs), 2) + self.assertEqual(len(infered_input_dist_attrs), 1) + self.assertEqual(len(infered_output_dist_attrs), 2) + + self.assertEqual(infered_input_dist_attrs[0].dims_mapping, [0, -1, -1]) + self.assertEqual(infered_output_dist_attrs[0].dims_mapping, [0, -1, -1]) + self.assertEqual(infered_output_dist_attrs[1].dims_mapping, [0, -1, -1]) + + def test_multi_mesh_dim(self): + process_mesh = auto.ProcessMesh(mesh=[[0, 1, 2], [3, 4, 5]]) + self.x_dist_tensor_spec.set_process_mesh(process_mesh) + self.x_dist_tensor_spec.shape = [96, 32, 48, 24] + + # num_or_sections = 3, axis = -1 + # [0, 1, -1, -1] --> [0, 1, -1, -1], [0, 1, -1, -1], [0, 1, -1, -1], [0, 1, -1, -1] + self.rule = get_spmd_rule("split_with_num") + self.attrs = {} + self.attrs['num'] = 3 + self.attrs['axis'] = -1 + self.x_dist_tensor_spec.set_dims_mapping([0, 1, -1, -1]) + result_dist_attrs = self.rule.infer_forward( + [self.x_dist_tensor_spec], self.attrs + ) + infered_input_dist_attrs = result_dist_attrs[0] + infered_output_dist_attrs = result_dist_attrs[1] + + self.assertEqual(len(result_dist_attrs), 2) + self.assertEqual(len(infered_input_dist_attrs), 1) + self.assertEqual(len(infered_output_dist_attrs), 3) + + self.assertEqual( + infered_input_dist_attrs[0].dims_mapping, [0, 1, -1, -1] + ) + self.assertEqual( + infered_output_dist_attrs[0].dims_mapping, [0, 1, -1, -1] + ) + self.assertEqual( + infered_output_dist_attrs[1].dims_mapping, [0, 1, -1, -1] + ) + self.assertEqual( + infered_output_dist_attrs[2].dims_mapping, [0, 1, -1, -1] + ) + + # num_or_sections = [32, 32, 32], axis = 0 + # [0, 1, -1, -1] --> [-1, 1, -1, -1], [-1, 1, -1, -1], [-1, 1, -1, -1], [-1, 1, -1, -1] + self.rule = get_spmd_rule("split") + self.attrs = {} + self.attrs['sections'] = [32, 32, 32] + self.attrs['axis'] = 0 + self.x_dist_tensor_spec.set_dims_mapping([0, 1, -1, -1]) + result_dist_attrs = self.rule.infer_forward( + [self.x_dist_tensor_spec], self.attrs + ) + infered_input_dist_attrs = result_dist_attrs[0] + infered_output_dist_attrs = result_dist_attrs[1] + + self.assertEqual(len(result_dist_attrs), 2) + self.assertEqual(len(infered_input_dist_attrs), 1) + self.assertEqual(len(infered_output_dist_attrs), 3) + + self.assertEqual( + infered_input_dist_attrs[0].dims_mapping, [-1, 1, -1, -1] + ) + self.assertEqual( + infered_output_dist_attrs[0].dims_mapping, [-1, 1, -1, -1] + ) + self.assertEqual( + infered_output_dist_attrs[1].dims_mapping, [-1, 1, -1, -1] + ) + self.assertEqual( + infered_output_dist_attrs[2].dims_mapping, [-1, 1, -1, -1] + ) + + +if __name__ == "__main__": + unittest.main() From 4ff8fca52c7bb2507aba4c21cefae3f09de2a527 Mon Sep 17 00:00:00 2001 From: umiswing Date: Mon, 24 Jul 2023 11:07:22 +0800 Subject: [PATCH 07/34] Fix test_sparse_norm_op failure. (#55405) * Fix test failed on cudnn. * Fix codestyle. 
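
The underlying fix, visible in the diff below, is to seed both backward passes with one explicit random gradient instead of relying on the implicit all-ones gradient, so the dense and sparse batch-norm paths receive the same cotangent before their outputs and input gradients are compared. A condensed sketch of that pattern follows; the 5-D NDHWC shape, channel count, and tolerance are illustrative rather than taken from the test.

    import numpy as np
    import paddle

    data = np.random.rand(2, 6, 6, 6, 3).astype("float32")  # NDHWC layout
    x = paddle.to_tensor(data)
    x.stop_gradient = False
    y = paddle.nn.BatchNorm3D(3, data_format="NDHWC")(x)

    # One shared random cotangent, instead of the implicit all-ones gradient.
    grad = np.random.uniform(-0.01, 0.01, y.shape).astype("float32")
    y.backward(paddle.to_tensor(grad))

    sp_x = paddle.to_tensor(data).to_sparse_coo(4)
    sp_x.stop_gradient = False
    sp_y = paddle.sparse.nn.BatchNorm(3, data_format="NDHWC")(sp_x)
    sp_y.backward(paddle.to_tensor(grad).to_sparse_coo(4))

    np.testing.assert_allclose(sp_y.to_dense().numpy(), y.numpy(), rtol=1e-5)
    np.testing.assert_allclose(
        sp_x.grad.to_dense().numpy(), x.grad.numpy(), rtol=1e-5
    )
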
--- test/legacy_test/test_sparse_norm_op.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/test/legacy_test/test_sparse_norm_op.py b/test/legacy_test/test_sparse_norm_op.py index c17a252ee75a6..25a253d9787b4 100644 --- a/test/legacy_test/test_sparse_norm_op.py +++ b/test/legacy_test/test_sparse_norm_op.py @@ -100,19 +100,23 @@ def check(self, shape): else: bn = paddle.nn.BatchNorm3D(shape[-1], data_format=data_format) y = bn(x) - y.backward() + np.random.seed(5) + loss_data = np.random.uniform(-0.01, 0.01, y.shape).astype("float32") + loss = paddle.to_tensor(loss_data) + y.backward(loss) sp_x = paddle.to_tensor(data).to_sparse_coo(dim - 1) sp_x.stop_gradient = False sp_bn = paddle.sparse.nn.BatchNorm(shape[-1], data_format=data_format) sp_y = sp_bn(sp_x) - sp_y.backward() + sp_loss = loss.to_sparse_coo(dim - 1) + sp_y.backward(sp_loss) np.testing.assert_allclose( - y.numpy(), sp_y.to_dense().numpy(), rtol=1e-5 + sp_y.to_dense().numpy(), y.numpy(), rtol=1e-5 ) np.testing.assert_allclose( - x.grad.numpy(), sp_x.grad.to_dense().numpy(), rtol=1e-5 + sp_x.grad.to_dense().numpy(), x.grad.numpy(), rtol=1e-5 ) def test_nd(self): From 5b8f06377cd1199b6f9c96e7289bced90a9740c1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=82=85=E5=89=91=E5=AF=92?= Date: Mon, 24 Jul 2023 11:22:06 +0800 Subject: [PATCH 08/34] delete modification on pre-commit (#55519) --- paddle/cinn/auto_schedule/analysis/analyze_ir.cc | 2 +- paddle/cinn/auto_schedule/cost_model/feature_extractor.cc | 2 +- paddle/cinn/auto_schedule/database/jsonfile_database_test.cc | 2 +- .../cinn/auto_schedule/search_space/auto_gen_rule/auto_bind.cc | 2 +- .../auto_schedule/search_space/auto_gen_rule/auto_inline.cc | 2 +- .../auto_schedule/search_space/auto_gen_rule/auto_unroll.cc | 2 +- .../search_space/auto_gen_rule/multi_level_tiling.cc | 2 +- .../cinn/auto_schedule/search_space/auto_gen_rule/skip_rule.cc | 2 +- paddle/cinn/auto_schedule/search_space/search_space.cc | 2 +- .../cinn/auto_schedule/search_strategy/evolutionary_search.cc | 2 +- paddle/cinn/auto_schedule/task/task_optimizer.cc | 2 +- paddle/cinn/auto_schedule/task/task_registry.h | 2 +- paddle/cinn/backends/codegen_cuda_util.h | 2 +- paddle/cinn/common/cas.cc | 2 +- paddle/cinn/hlir/pe/broadcast.cc | 2 +- paddle/cinn/hlir/pe/nn.cc | 2 +- paddle/cinn/ir/schedule/ir_schedule.cc | 2 +- paddle/cinn/ir/schedule/ir_schedule_util.cc | 2 +- paddle/cinn/ir/test/CMakeLists.txt | 1 + paddle/cinn/{optim => ir/test}/ir_copy_test.cc | 2 +- paddle/cinn/ir/test/schedule_desc_test.cc | 2 +- paddle/cinn/ir/utils/CMakeLists.txt | 3 ++- paddle/cinn/{optim => ir/utils}/ir_copy.cc | 2 +- paddle/cinn/{optim => ir/utils}/ir_copy.h | 0 paddle/cinn/optim/CMakeLists.txt | 2 -- paddle/cinn/optim/compute_inline_expand.cc | 2 +- paddle/cinn/optim/ir_replace.cc | 2 +- paddle/cinn/optim/optimize.cc | 2 +- paddle/cinn/optim/replace_call_with_expr.cc | 2 +- paddle/cinn/optim/replace_var_with_expr.cc | 2 +- paddle/cinn/optim/transform_gpu_forloop.cc | 2 +- paddle/cinn/optim/transform_polyfor_to_for.cc | 2 +- paddle/cinn/optim/unroll_loops.cc | 2 +- paddle/cinn/optim/vectorize_loops.cc | 2 +- paddle/cinn/poly/stage.cc | 2 +- paddle/cinn/pybind/optim.cc | 2 +- 36 files changed, 35 insertions(+), 35 deletions(-) mode change 100755 => 100644 paddle/cinn/backends/codegen_cuda_util.h rename paddle/cinn/{optim => ir/test}/ir_copy_test.cc (95%) rename paddle/cinn/{optim => ir/utils}/ir_copy.cc (99%) rename paddle/cinn/{optim => ir/utils}/ir_copy.h (100%) mode change 100755 => 100644 
paddle/cinn/optim/ir_replace.cc diff --git a/paddle/cinn/auto_schedule/analysis/analyze_ir.cc b/paddle/cinn/auto_schedule/analysis/analyze_ir.cc index 8adc61ba269b4..17aad495b246a 100644 --- a/paddle/cinn/auto_schedule/analysis/analyze_ir.cc +++ b/paddle/cinn/auto_schedule/analysis/analyze_ir.cc @@ -26,10 +26,10 @@ #include "paddle/cinn/ir/lowered_func.h" #include "paddle/cinn/ir/schedule/ir_schedule.h" #include "paddle/cinn/ir/tensor.h" +#include "paddle/cinn/ir/utils/ir_copy.h" #include "paddle/cinn/ir/utils/ir_nodes_collector.h" #include "paddle/cinn/ir/utils/ir_printer.h" #include "paddle/cinn/lang/lower.h" -#include "paddle/cinn/optim/ir_copy.h" #include "paddle/cinn/optim/optimize.h" #include "paddle/cinn/optim/transform_gpu_forloop.h" diff --git a/paddle/cinn/auto_schedule/cost_model/feature_extractor.cc b/paddle/cinn/auto_schedule/cost_model/feature_extractor.cc index 3369ef4b6ba8b..db2d3f62ed6a9 100644 --- a/paddle/cinn/auto_schedule/cost_model/feature_extractor.cc +++ b/paddle/cinn/auto_schedule/cost_model/feature_extractor.cc @@ -34,8 +34,8 @@ #include "paddle/cinn/ir/ir.h" #include "paddle/cinn/ir/ir_base.h" #include "paddle/cinn/ir/schedule/ir_schedule.h" +#include "paddle/cinn/ir/utils/ir_copy.h" #include "paddle/cinn/ir/utils/ir_printer.h" -#include "paddle/cinn/optim/ir_copy.h" #include "paddle/cinn/optim/transform_polyfor_to_for.h" namespace cinn { diff --git a/paddle/cinn/auto_schedule/database/jsonfile_database_test.cc b/paddle/cinn/auto_schedule/database/jsonfile_database_test.cc index d977e02dc15c2..9a7bf9d568bc3 100644 --- a/paddle/cinn/auto_schedule/database/jsonfile_database_test.cc +++ b/paddle/cinn/auto_schedule/database/jsonfile_database_test.cc @@ -24,8 +24,8 @@ #include "paddle/cinn/auto_schedule/task/task_registry.h" #include "paddle/cinn/cinn.h" #include "paddle/cinn/ir/schedule/ir_schedule.h" +#include "paddle/cinn/ir/utils/ir_copy.h" #include "paddle/cinn/ir/utils/ir_printer.h" -#include "paddle/cinn/optim/ir_copy.h" namespace cinn { namespace auto_schedule { diff --git a/paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_bind.cc b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_bind.cc index 3fd38ab0d4ba6..06215d98d8b27 100644 --- a/paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_bind.cc +++ b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_bind.cc @@ -17,9 +17,9 @@ #include #include "paddle/cinn/ir/schedule/ir_schedule.h" +#include "paddle/cinn/ir/utils/ir_copy.h" #include "paddle/cinn/ir/utils/ir_nodes_collector.h" #include "paddle/cinn/ir/utils/ir_printer.h" -#include "paddle/cinn/optim/ir_copy.h" namespace cinn { namespace auto_schedule { diff --git a/paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_inline.cc b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_inline.cc index db5f8e711d8d9..946947611f35d 100644 --- a/paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_inline.cc +++ b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_inline.cc @@ -27,9 +27,9 @@ #include "paddle/cinn/ir/ir.h" #include "paddle/cinn/ir/ir_base.h" #include "paddle/cinn/ir/schedule/ir_schedule.h" +#include "paddle/cinn/ir/utils/ir_copy.h" #include "paddle/cinn/ir/utils/ir_nodes_collector.h" #include "paddle/cinn/ir/utils/ir_printer.h" -#include "paddle/cinn/optim/ir_copy.h" namespace cinn { namespace auto_schedule { diff --git a/paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_unroll.cc b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_unroll.cc index 3ea7882b80654..946bd9e9d7730 100644 
--- a/paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_unroll.cc +++ b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_unroll.cc @@ -19,9 +19,9 @@ #include #include "paddle/cinn/ir/schedule/ir_schedule.h" +#include "paddle/cinn/ir/utils/ir_copy.h" #include "paddle/cinn/ir/utils/ir_nodes_collector.h" #include "paddle/cinn/ir/utils/ir_printer.h" -#include "paddle/cinn/optim/ir_copy.h" namespace cinn { namespace auto_schedule { diff --git a/paddle/cinn/auto_schedule/search_space/auto_gen_rule/multi_level_tiling.cc b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/multi_level_tiling.cc index b521db598fca0..feecf79a07d10 100644 --- a/paddle/cinn/auto_schedule/search_space/auto_gen_rule/multi_level_tiling.cc +++ b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/multi_level_tiling.cc @@ -31,9 +31,9 @@ #include "paddle/cinn/ir/ir_base.h" #include "paddle/cinn/ir/schedule/ir_schedule.h" #include "paddle/cinn/ir/tensor.h" +#include "paddle/cinn/ir/utils/ir_copy.h" #include "paddle/cinn/ir/utils/ir_nodes_collector.h" #include "paddle/cinn/ir/utils/ir_printer.h" -#include "paddle/cinn/optim/ir_copy.h" namespace cinn { namespace auto_schedule { diff --git a/paddle/cinn/auto_schedule/search_space/auto_gen_rule/skip_rule.cc b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/skip_rule.cc index 9ad9dd143b494..7810822299c8c 100644 --- a/paddle/cinn/auto_schedule/search_space/auto_gen_rule/skip_rule.cc +++ b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/skip_rule.cc @@ -19,7 +19,7 @@ #include "paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_gen_rule.h" #include "paddle/cinn/common/target.h" #include "paddle/cinn/ir/schedule/ir_schedule.h" -#include "paddle/cinn/optim/ir_copy.h" +#include "paddle/cinn/ir/utils/ir_copy.h" namespace cinn { namespace auto_schedule { diff --git a/paddle/cinn/auto_schedule/search_space/search_space.cc b/paddle/cinn/auto_schedule/search_space/search_space.cc index f3f8ba2f673f9..9ceab068c76d4 100644 --- a/paddle/cinn/auto_schedule/search_space/search_space.cc +++ b/paddle/cinn/auto_schedule/search_space/search_space.cc @@ -31,7 +31,7 @@ #include "paddle/cinn/auto_schedule/task/tune_task.h" #include "paddle/cinn/ir/ir_base.h" #include "paddle/cinn/ir/schedule/ir_schedule.h" -#include "paddle/cinn/optim/ir_copy.h" +#include "paddle/cinn/ir/utils/ir_copy.h" #include "paddle/cinn/runtime/flags.h" DECLARE_bool(auto_schedule_use_cost_model); diff --git a/paddle/cinn/auto_schedule/search_strategy/evolutionary_search.cc b/paddle/cinn/auto_schedule/search_strategy/evolutionary_search.cc index d139cc4c1d309..c2668e1da72d3 100644 --- a/paddle/cinn/auto_schedule/search_strategy/evolutionary_search.cc +++ b/paddle/cinn/auto_schedule/search_strategy/evolutionary_search.cc @@ -31,7 +31,7 @@ #include "paddle/cinn/auto_schedule/task/task_registry.h" #include "paddle/cinn/auto_schedule/task/tune_task.h" #include "paddle/cinn/auto_schedule/tuning.h" -#include "paddle/cinn/optim/ir_copy.h" +#include "paddle/cinn/ir/utils/ir_copy.h" #include "paddle/cinn/utils/multi_threading.h" #include "paddle/cinn/utils/sized_multi_set.h" #include "paddle/cinn/utils/string.h" diff --git a/paddle/cinn/auto_schedule/task/task_optimizer.cc b/paddle/cinn/auto_schedule/task/task_optimizer.cc index cd89eedbb75d1..c4a03bde62464 100644 --- a/paddle/cinn/auto_schedule/task/task_optimizer.cc +++ b/paddle/cinn/auto_schedule/task/task_optimizer.cc @@ -30,7 +30,7 @@ #include "paddle/cinn/ir/ir.h" #include "paddle/cinn/ir/ir_base.h" #include "paddle/cinn/ir/schedule/ir_schedule.h" 
-#include "paddle/cinn/optim/ir_copy.h" +#include "paddle/cinn/ir/utils/ir_copy.h" #include "paddle/cinn/optim/transform_gpu_forloop.h" #include "paddle/cinn/runtime/flags.h" #include "paddle/cinn/utils/string.h" diff --git a/paddle/cinn/auto_schedule/task/task_registry.h b/paddle/cinn/auto_schedule/task/task_registry.h index 158a899e5b915..c506a7ea332ef 100644 --- a/paddle/cinn/auto_schedule/task/task_registry.h +++ b/paddle/cinn/auto_schedule/task/task_registry.h @@ -20,7 +20,7 @@ #include #include "paddle/cinn/ir/schedule/ir_schedule.h" -#include "paddle/cinn/optim/ir_copy.h" +#include "paddle/cinn/ir/utils/ir_copy.h" #include "paddle/cinn/utils/registry.h" namespace cinn { diff --git a/paddle/cinn/backends/codegen_cuda_util.h b/paddle/cinn/backends/codegen_cuda_util.h old mode 100755 new mode 100644 index a0f892ca58c02..802d93d9a3144 --- a/paddle/cinn/backends/codegen_cuda_util.h +++ b/paddle/cinn/backends/codegen_cuda_util.h @@ -22,8 +22,8 @@ #include "paddle/cinn/cinn.h" #include "paddle/cinn/ir/ir.h" +#include "paddle/cinn/ir/utils/ir_copy.h" #include "paddle/cinn/ir/utils/ir_mutator.h" -#include "paddle/cinn/optim/ir_copy.h" namespace cinn { namespace backends { diff --git a/paddle/cinn/common/cas.cc b/paddle/cinn/common/cas.cc index ff867858d80f8..b72650301bbfe 100644 --- a/paddle/cinn/common/cas.cc +++ b/paddle/cinn/common/cas.cc @@ -22,12 +22,12 @@ #include "paddle/cinn/common/arithmatic.h" #include "paddle/cinn/common/ir_util.h" #include "paddle/cinn/ir/op/ir_operators.h" +#include "paddle/cinn/ir/utils/ir_copy.h" #include "paddle/cinn/ir/utils/ir_mutator.h" #include "paddle/cinn/ir/utils/ir_nodes_collector.h" #include "paddle/cinn/ir/utils/ir_printer.h" #include "paddle/cinn/ir/utils/ir_visitor.h" #include "paddle/cinn/optim/cast_simplify.h" -#include "paddle/cinn/optim/ir_copy.h" #include "paddle/cinn/utils/string.h" namespace cinn { diff --git a/paddle/cinn/hlir/pe/broadcast.cc b/paddle/cinn/hlir/pe/broadcast.cc index 29abc69587f5e..30ed792869276 100644 --- a/paddle/cinn/hlir/pe/broadcast.cc +++ b/paddle/cinn/hlir/pe/broadcast.cc @@ -20,9 +20,9 @@ #include "paddle/cinn/hlir/op/op_util.h" #include "paddle/cinn/ir/ir_base.h" #include "paddle/cinn/ir/op/ir_operators.h" +#include "paddle/cinn/ir/utils/ir_copy.h" #include "paddle/cinn/lang/builtin.h" #include "paddle/cinn/lang/compute.h" -#include "paddle/cinn/optim/ir_copy.h" namespace cinn { namespace hlir { diff --git a/paddle/cinn/hlir/pe/nn.cc b/paddle/cinn/hlir/pe/nn.cc index 26929568c901f..030cd7472e430 100644 --- a/paddle/cinn/hlir/pe/nn.cc +++ b/paddle/cinn/hlir/pe/nn.cc @@ -29,9 +29,9 @@ #include "paddle/cinn/hlir/pe/nn_util.h" #include "paddle/cinn/hlir/pe/schedule.h" #include "paddle/cinn/ir/op/ir_operators.h" +#include "paddle/cinn/ir/utils/ir_copy.h" #include "paddle/cinn/lang/builtin.h" #include "paddle/cinn/lang/compute.h" -#include "paddle/cinn/optim/ir_copy.h" namespace cinn { namespace hlir { diff --git a/paddle/cinn/ir/schedule/ir_schedule.cc b/paddle/cinn/ir/schedule/ir_schedule.cc index af0db05f36af0..88609c7a7eb9b 100644 --- a/paddle/cinn/ir/schedule/ir_schedule.cc +++ b/paddle/cinn/ir/schedule/ir_schedule.cc @@ -32,11 +32,11 @@ #include "paddle/cinn/ir/op/ir_operators.h" #include "paddle/cinn/ir/schedule/ir_schedule_error.h" #include "paddle/cinn/ir/schedule/ir_schedule_util.h" +#include "paddle/cinn/ir/utils/ir_copy.h" #include "paddle/cinn/ir/utils/ir_mutator.h" #include "paddle/cinn/ir/utils/ir_printer.h" #include "paddle/cinn/ir/utils/ir_visitor.h" #include "paddle/cinn/lang/compute.h" -#include 
"paddle/cinn/optim/ir_copy.h" #include "paddle/cinn/optim/ir_simplify.h" #include "paddle/cinn/optim/replace_var_with_expr.h" #include "paddle/cinn/utils/string.h" diff --git a/paddle/cinn/ir/schedule/ir_schedule_util.cc b/paddle/cinn/ir/schedule/ir_schedule_util.cc index 70bc7b0415b88..adfe5fdcef861 100644 --- a/paddle/cinn/ir/schedule/ir_schedule_util.cc +++ b/paddle/cinn/ir/schedule/ir_schedule_util.cc @@ -27,11 +27,11 @@ #include "paddle/cinn/common/ir_util.h" #include "paddle/cinn/ir/ir.h" #include "paddle/cinn/ir/op/ir_operators.h" +#include "paddle/cinn/ir/utils/ir_copy.h" #include "paddle/cinn/ir/utils/ir_nodes_collector.h" #include "paddle/cinn/ir/utils/ir_printer.h" #include "paddle/cinn/ir/utils/ir_visitor.h" #include "paddle/cinn/lang/compute.h" -#include "paddle/cinn/optim/ir_copy.h" #include "paddle/cinn/optim/ir_simplify.h" #include "paddle/cinn/optim/replace_var_with_expr.h" diff --git a/paddle/cinn/ir/test/CMakeLists.txt b/paddle/cinn/ir/test/CMakeLists.txt index de5816d9e808d..bef31ed067e3b 100644 --- a/paddle/cinn/ir/test/CMakeLists.txt +++ b/paddle/cinn/ir/test/CMakeLists.txt @@ -16,3 +16,4 @@ cinn_cc_test(test_intrinsic_ops SRCS intrinsic_ops_test.cc DEPS cinncore) cinn_cc_test(test_ir_verify SRCS ir_verify_test.cc DEPS cinncore) cinn_cc_test(test_schedule_desc SRCS schedule_desc_test.cc DEPS cinncore) cinn_cc_test(test_ir_compare SRCS ir_compare_test.cc DEPS cinncore) +cinn_cc_test(test_ir_copy SRCS ir_copy_test.cc DEPS cinncore) diff --git a/paddle/cinn/optim/ir_copy_test.cc b/paddle/cinn/ir/test/ir_copy_test.cc similarity index 95% rename from paddle/cinn/optim/ir_copy_test.cc rename to paddle/cinn/ir/test/ir_copy_test.cc index 91128264853a1..cd3199d4947dd 100644 --- a/paddle/cinn/optim/ir_copy_test.cc +++ b/paddle/cinn/ir/test/ir_copy_test.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/cinn/optim/ir_copy.h" +#include "paddle/cinn/ir/utils/ir_copy.h" #include diff --git a/paddle/cinn/ir/test/schedule_desc_test.cc b/paddle/cinn/ir/test/schedule_desc_test.cc index 1cefd9a1b1971..66a01b14b08d7 100644 --- a/paddle/cinn/ir/test/schedule_desc_test.cc +++ b/paddle/cinn/ir/test/schedule_desc_test.cc @@ -20,9 +20,9 @@ #include "paddle/cinn/cinn.h" #include "paddle/cinn/common/context.h" #include "paddle/cinn/ir/schedule/ir_schedule.h" +#include "paddle/cinn/ir/utils/ir_copy.h" #include "paddle/cinn/ir/utils/ir_printer.h" #include "paddle/cinn/lang/lower.h" -#include "paddle/cinn/optim/ir_copy.h" #include "paddle/cinn/utils/string.h" #include "paddle/cinn/utils/type_defs.h" diff --git a/paddle/cinn/ir/utils/CMakeLists.txt b/paddle/cinn/ir/utils/CMakeLists.txt index 58d9c3b59dc1a..5613bf7260155 100644 --- a/paddle/cinn/ir/utils/CMakeLists.txt +++ b/paddle/cinn/ir/utils/CMakeLists.txt @@ -8,4 +8,5 @@ gather_srcs( ir_printer.cc ir_verify.cc ir_compare.cc - ir_nodes_collector.cc) + ir_nodes_collector.cc + ir_copy.cc) diff --git a/paddle/cinn/optim/ir_copy.cc b/paddle/cinn/ir/utils/ir_copy.cc similarity index 99% rename from paddle/cinn/optim/ir_copy.cc rename to paddle/cinn/ir/utils/ir_copy.cc index 3bf7eb323a8fe..22d7c99bcd322 100644 --- a/paddle/cinn/optim/ir_copy.cc +++ b/paddle/cinn/ir/utils/ir_copy.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/cinn/optim/ir_copy.h" +#include "paddle/cinn/ir/utils/ir_copy.h" #include #include diff --git a/paddle/cinn/optim/ir_copy.h b/paddle/cinn/ir/utils/ir_copy.h similarity index 100% rename from paddle/cinn/optim/ir_copy.h rename to paddle/cinn/ir/utils/ir_copy.h diff --git a/paddle/cinn/optim/CMakeLists.txt b/paddle/cinn/optim/CMakeLists.txt index c8313a91af439..b6d3632d4bc7e 100755 --- a/paddle/cinn/optim/CMakeLists.txt +++ b/paddle/cinn/optim/CMakeLists.txt @@ -5,7 +5,6 @@ gather_srcs( SRCS remove_nested_block.cc replace_call_with_expr.cc - ir_copy.cc ir_replace.cc replace_var_with_expr.cc tensor_write_tell.cc @@ -38,7 +37,6 @@ endif() cinn_cc_test(test_remove_nested_block SRCS remove_nested_block_test.cc DEPS cinncore) -cinn_cc_test(test_ir_copy SRCS ir_copy_test.cc DEPS cinncore) cinn_cc_test(test_ir_simplify SRCS ir_simplify_test.cc DEPS cinncore) cinn_cc_test(test_replace_call_with_expr SRCS replace_call_with_expr_test.cc DEPS cinncore) diff --git a/paddle/cinn/optim/compute_inline_expand.cc b/paddle/cinn/optim/compute_inline_expand.cc index d18d5a45f6375..8dad52ab4d9bc 100644 --- a/paddle/cinn/optim/compute_inline_expand.cc +++ b/paddle/cinn/optim/compute_inline_expand.cc @@ -18,8 +18,8 @@ #include #include "paddle/cinn/common/graph_utils.h" +#include "paddle/cinn/ir/utils/ir_copy.h" #include "paddle/cinn/ir/utils/ir_mutator.h" -#include "paddle/cinn/optim/ir_copy.h" #include "paddle/cinn/optim/replace_var_with_expr.h" namespace cinn { diff --git a/paddle/cinn/optim/ir_replace.cc b/paddle/cinn/optim/ir_replace.cc old mode 100755 new mode 100644 index 815fe4a15b2d9..3dc39a08a3817 --- a/paddle/cinn/optim/ir_replace.cc +++ b/paddle/cinn/optim/ir_replace.cc @@ -16,9 +16,9 @@ #include +#include "paddle/cinn/ir/utils/ir_copy.h" #include "paddle/cinn/ir/utils/ir_mutator.h" #include "paddle/cinn/ir/utils/ir_printer.h" -#include "paddle/cinn/optim/ir_copy.h" #include "paddle/cinn/utils/string.h" namespace cinn { diff --git a/paddle/cinn/optim/optimize.cc b/paddle/cinn/optim/optimize.cc index 2acf8c331d11a..f7ff0643cdeb3 100644 --- a/paddle/cinn/optim/optimize.cc +++ b/paddle/cinn/optim/optimize.cc @@ -15,6 +15,7 @@ #include "paddle/cinn/optim/optimize.h" #include "paddle/cinn/ir/schedule/ir_schedule_util.h" +#include "paddle/cinn/ir/utils/ir_copy.h" #include "paddle/cinn/ir/utils/ir_printer.h" #include "paddle/cinn/optim/call_arg_list_to_pod_value.h" #include "paddle/cinn/optim/cast_bool_to_int8.h" @@ -24,7 +25,6 @@ #include "paddle/cinn/optim/fold_cinn_call_arguments.h" #include "paddle/cinn/optim/if_simplify.h" #include "paddle/cinn/optim/insert_debug_log_callee.h" -#include "paddle/cinn/optim/ir_copy.h" #include "paddle/cinn/optim/ir_simplify.h" #include "paddle/cinn/optim/lower_function_call_bind_vars.h" #include "paddle/cinn/optim/lower_intrin.h" diff --git a/paddle/cinn/optim/replace_call_with_expr.cc b/paddle/cinn/optim/replace_call_with_expr.cc index 26dc7662ec5ac..d63210d1d28f1 100644 --- a/paddle/cinn/optim/replace_call_with_expr.cc +++ b/paddle/cinn/optim/replace_call_with_expr.cc @@ -14,9 +14,9 @@ #include "paddle/cinn/optim/replace_call_with_expr.h" +#include "paddle/cinn/ir/utils/ir_copy.h" #include "paddle/cinn/ir/utils/ir_mutator.h" #include "paddle/cinn/ir/utils/ir_printer.h" -#include "paddle/cinn/optim/ir_copy.h" #include "paddle/cinn/optim/replace_var_with_expr.h" namespace cinn { diff --git a/paddle/cinn/optim/replace_var_with_expr.cc b/paddle/cinn/optim/replace_var_with_expr.cc index a8c9f8cd961d0..57ab47d7c0250 100644 --- 
a/paddle/cinn/optim/replace_var_with_expr.cc +++ b/paddle/cinn/optim/replace_var_with_expr.cc @@ -18,9 +18,9 @@ #include "paddle/cinn/ir/ir.h" #include "paddle/cinn/ir/op/ir_operators.h" #include "paddle/cinn/ir/tensor.h" +#include "paddle/cinn/ir/utils/ir_copy.h" #include "paddle/cinn/ir/utils/ir_mutator.h" #include "paddle/cinn/ir/utils/ir_printer.h" -#include "paddle/cinn/optim/ir_copy.h" #include "paddle/cinn/optim/ir_simplify.h" #include "paddle/cinn/optim/replace_const_param_to_integer.h" diff --git a/paddle/cinn/optim/transform_gpu_forloop.cc b/paddle/cinn/optim/transform_gpu_forloop.cc index 0b95313034ba2..d12a5c9f2dab8 100644 --- a/paddle/cinn/optim/transform_gpu_forloop.cc +++ b/paddle/cinn/optim/transform_gpu_forloop.cc @@ -24,9 +24,9 @@ #include "paddle/cinn/common/cas.h" #include "paddle/cinn/common/ir_util.h" #include "paddle/cinn/ir/ir.h" +#include "paddle/cinn/ir/utils/ir_copy.h" #include "paddle/cinn/ir/utils/ir_mutator.h" #include "paddle/cinn/ir/utils/ir_printer.h" -#include "paddle/cinn/optim/ir_copy.h" #include "paddle/cinn/optim/ir_simplify.h" #include "paddle/cinn/optim/replace_var_with_expr.h" #include "paddle/cinn/poly/isl_utils.h" diff --git a/paddle/cinn/optim/transform_polyfor_to_for.cc b/paddle/cinn/optim/transform_polyfor_to_for.cc index 82d500d5dca06..5efc0a1f44be3 100644 --- a/paddle/cinn/optim/transform_polyfor_to_for.cc +++ b/paddle/cinn/optim/transform_polyfor_to_for.cc @@ -22,10 +22,10 @@ #include "paddle/cinn/common/ir_util.h" #include "paddle/cinn/common/type.h" #include "paddle/cinn/ir/op/ir_operators.h" +#include "paddle/cinn/ir/utils/ir_copy.h" #include "paddle/cinn/ir/utils/ir_mutator.h" #include "paddle/cinn/ir/utils/ir_printer.h" #include "paddle/cinn/ir/utils/ir_visitor.h" -#include "paddle/cinn/optim/ir_copy.h" #include "paddle/cinn/optim/ir_simplify.h" namespace cinn { diff --git a/paddle/cinn/optim/unroll_loops.cc b/paddle/cinn/optim/unroll_loops.cc index 0510774805992..fc5fab85eca5f 100644 --- a/paddle/cinn/optim/unroll_loops.cc +++ b/paddle/cinn/optim/unroll_loops.cc @@ -18,9 +18,9 @@ #include #include "paddle/cinn/ir/op/ir_operators.h" +#include "paddle/cinn/ir/utils/ir_copy.h" #include "paddle/cinn/ir/utils/ir_mutator.h" #include "paddle/cinn/ir/utils/ir_printer.h" -#include "paddle/cinn/optim/ir_copy.h" #include "paddle/cinn/optim/ir_replace.h" namespace cinn { diff --git a/paddle/cinn/optim/vectorize_loops.cc b/paddle/cinn/optim/vectorize_loops.cc index e15ab71d46729..745bec47b4507 100644 --- a/paddle/cinn/optim/vectorize_loops.cc +++ b/paddle/cinn/optim/vectorize_loops.cc @@ -26,9 +26,9 @@ #include "paddle/cinn/common/cas.h" #include "paddle/cinn/common/ir_util.h" #include "paddle/cinn/ir/op/ir_operators.h" +#include "paddle/cinn/ir/utils/ir_copy.h" #include "paddle/cinn/ir/utils/ir_nodes_collector.h" #include "paddle/cinn/ir/utils/ir_printer.h" -#include "paddle/cinn/optim/ir_copy.h" #include "paddle/cinn/optim/ir_replace.h" #include "paddle/cinn/optim/ir_simplify.h" #include "paddle/cinn/optim/tensor_write_tell.h" diff --git a/paddle/cinn/poly/stage.cc b/paddle/cinn/poly/stage.cc index e5faafc5348fc..faa7a99c0cfde 100644 --- a/paddle/cinn/poly/stage.cc +++ b/paddle/cinn/poly/stage.cc @@ -24,12 +24,12 @@ #include "paddle/cinn/common/axis.h" #include "paddle/cinn/ir/operation.h" #include "paddle/cinn/ir/tensor.h" +#include "paddle/cinn/ir/utils/ir_copy.h" #include "paddle/cinn/ir/utils/ir_mutator.h" #include "paddle/cinn/ir/utils/ir_nodes_collector.h" #include "paddle/cinn/ir/utils/ir_printer.h" #include 
"paddle/cinn/ir/utils/ir_visitor.h" #include "paddle/cinn/lang/compute.h" -#include "paddle/cinn/optim/ir_copy.h" #include "paddle/cinn/optim/ir_replace.h" #include "paddle/cinn/optim/ir_simplify.h" #include "paddle/cinn/optim/replace_var_with_expr.h" diff --git a/paddle/cinn/pybind/optim.cc b/paddle/cinn/pybind/optim.cc index dfb159a23f352..00219477e8f85 100755 --- a/paddle/cinn/pybind/optim.cc +++ b/paddle/cinn/pybind/optim.cc @@ -18,7 +18,7 @@ #include "paddle/cinn/common/target.h" #include "paddle/cinn/common/type.h" #include "paddle/cinn/ir/op/ir_operators.h" -#include "paddle/cinn/optim/ir_copy.h" +#include "paddle/cinn/ir/utils/ir_copy.h" #include "paddle/cinn/optim/ir_simplify.h" #include "paddle/cinn/pybind/bind.h" #include "paddle/cinn/pybind/bind_utils.h" From bea1f04c300ab932fd96f27ee1304f507e7bb6ba Mon Sep 17 00:00:00 2001 From: Xinyu Chen Date: Mon, 24 Jul 2023 06:33:45 +0000 Subject: [PATCH 09/34] onednn: remove fc_elementwise_add fusion (#55504) * onednn: remove fc+eltwiseadd fusion pass * onednn: remove post-sum fusion in fc kernel * onednn: tests: make unfused add run into f32 --- paddle/fluid/framework/ir/CMakeLists.txt | 1 - .../fc_elementwise_add_mkldnn_fuse_pass.cc | 165 ------------------ .../fc_elementwise_add_mkldnn_fuse_pass.h | 48 ----- .../inference/api/paddle_pass_builder.cc | 7 +- paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc | 35 +--- .../api/analysis_predictor_tester.cc | 4 +- .../api/analyzer_ernie_int8_tester.cc | 2 +- .../inference/api/analyzer_vit_ocr_tester.cc | 1 - test/ir/inference/CMakeLists.txt | 2 - ...est_onednn_fc_elementwise_add_fuse_pass.py | 104 ----------- 10 files changed, 5 insertions(+), 364 deletions(-) delete mode 100644 paddle/fluid/framework/ir/mkldnn/fc_elementwise_add_mkldnn_fuse_pass.cc delete mode 100644 paddle/fluid/framework/ir/mkldnn/fc_elementwise_add_mkldnn_fuse_pass.h delete mode 100644 test/ir/inference/test_onednn_fc_elementwise_add_fuse_pass.py diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index f4b88178499be..fa0fa7fccfd61 100755 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -174,7 +174,6 @@ if(WITH_MKLDNN) pass_library(conv_elementwise_add_mkldnn_fuse_pass inference DIR mkldnn) pass_library(int8_scale_calculation_mkldnn_pass inference DIR mkldnn) pass_library(params_quantization_mkldnn_pass inference DIR mkldnn) - pass_library(fc_elementwise_add_mkldnn_fuse_pass inference DIR mkldnn) pass_library(scale_matmul_fuse_pass inference DIR mkldnn) pass_library(cpu_bfloat16_placement_pass inference DIR mkldnn) pass_library(cpu_bfloat16_pass inference DIR mkldnn) diff --git a/paddle/fluid/framework/ir/mkldnn/fc_elementwise_add_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/fc_elementwise_add_mkldnn_fuse_pass.cc deleted file mode 100644 index ef01acd88c0b7..0000000000000 --- a/paddle/fluid/framework/ir/mkldnn/fc_elementwise_add_mkldnn_fuse_pass.cc +++ /dev/null @@ -1,165 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/framework/ir/mkldnn/fc_elementwise_add_mkldnn_fuse_pass.h" - -#include "paddle/fluid/framework/ir/graph_traits.h" -#include "paddle/fluid/framework/op_version_registry.h" -#include "paddle/utils/string/pretty_log.h" - -namespace paddle { -namespace framework { -namespace ir { - -FCResidualConnectionMKLDNNFusePass::FCResidualConnectionMKLDNNFusePass() { - AddOpCompat(OpCompat("fc")) - .AddInput("Input") - .IsTensor() - .End() - .AddInput("W") - .IsTensor() - .End() - .AddInput("Bias") - .IsTensor() - .End() - .AddOutput("Out") - .IsTensor() - .End() - .AddAttr("in_num_col_dims") - .IsNumGE(1) - .End(); - - AddOpCompat(OpCompat("elementwise_add")) - .AddInput("X") - .IsTensor() - .End() - .AddInput("Y") - .IsTensor() - .End() - .AddOutput("Out") - .IsTensor() - .End() - .AddAttr("axis") - .IsIntIn({-1, 0, 1}) - .End(); -} - -GraphWithStats FCResidualConnectionMKLDNNFusePass::FuseFC( - const std::string& name_scope, - const GraphWithStats& graph_with_stats, - bool fc_as_x) const { - GraphPatternDetector gpd; - auto pattern = gpd.mutable_pattern(); - patterns::FCMKLDNN fc_pattern{pattern, name_scope}; - auto fc_output = fc_pattern(false /* with residual */); - - patterns::ResidualElementwise elementwise_pattern{ - pattern, name_scope, fc_as_x}; - elementwise_pattern( - fc_output, - pattern->NewNode(elementwise_pattern.residual_data_repr()), - "elementwise_add", - fc_as_x); - fc_output->AsIntermediate(); - - int found_fc_count = 0; - - auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, - Graph* g) { - VLOG(4) << "Fuse fc + elementwise_add as residual"; - GET_IR_NODE_FROM_SUBGRAPH(fc_op, fc, fc_pattern); - GET_IR_NODE_FROM_SUBGRAPH(fc_input, input, fc_pattern); - GET_IR_NODE_FROM_SUBGRAPH(fc_weights, weights, fc_pattern); - GET_IR_NODE_FROM_SUBGRAPH(fc_output, output, fc_pattern); - - GET_IR_NODE_FROM_SUBGRAPH( - elementwise_op, elementwise_op, elementwise_pattern); - GET_IR_NODE_FROM_SUBGRAPH( - residual_data, residual_data, elementwise_pattern); - GET_IR_NODE_FROM_SUBGRAPH( - elementwise_out, elementwise_out, elementwise_pattern); - - if (FindFuseOption(*fc_op, *elementwise_op) != FUSE_MKLDNN) { - VLOG(4) << "Skipping fusion for " << fc_op->Name() << "(" << fc_op->id() - << ") with " << elementwise_op->Name() << "(" - << elementwise_op->id() - << ") because not both ops have use_mkldnn"; - return; - } - if (!IsReachable(g, residual_data, fc_output)) { - VLOG(4) << "Skipping fusion for " << fc_op->Name() << "(" << fc_op->id() - << ") with " << elementwise_op->Name() << "(" - << elementwise_op->id() << ") because residual input " - << residual_data->Name() << "(" << residual_data->id() - << ") is not " - "reachable"; - return; - } - if (HasFusedActivation(fc_op)) { - VLOG(4) << "Skipping fusion for " << fc_op->Name() << "(" << fc_op->id() - << ") with " << elementwise_op->Name() << "(" - << elementwise_op->id() << ") because fc has activation fused"; - return; - } - - if (!IsCompat(subgraph, g)) { - LOG(WARNING) - << "op compat for fc_elementwise_add_mkldnn_fuse_pass failed."; - return; - } - - fc_op->Op()->SetInput("ResidualData", {residual_data->Name()}); - fc_op->Op()->SetOutput("Out", {elementwise_out->Name()}); - fc_op->Op()->SetAttr("fuse_residual_connection", true); - - GraphSafeRemoveNodes(g, {fc_output, elementwise_op}); - - IR_NODE_LINK_TO(residual_data, fc_op); - IR_NODE_LINK_TO(fc_op, elementwise_out); - - found_fc_count++; - }; - - 
gpd(graph_with_stats.first, handler); - if ((!Has("disable_logs") || !Get("disable_logs")) && - (found_fc_count > 0)) { - std::stringstream msg_ss; - std::string fusionMode = fc_as_x ? "x" : "y"; - msg_ss << "--- Fused " << found_fc_count << " fc (as " << fusionMode - << ") + elementwise_add patterns"; - paddle::string::PrettyLogDetail(msg_ss.str().c_str()); - } - - return std::make_pair(graph_with_stats.first, - found_fc_count + graph_with_stats.second); -} - -void FCResidualConnectionMKLDNNFusePass::ApplyImpl(ir::Graph* graph) const { - FusePassBase::Init(name_scope_, graph); - auto graph_with_stats = FuseFC(name_scope_, std::make_pair(graph, 0), true); - graph_with_stats = FuseFC(name_scope_, graph_with_stats, false); - - AddStatis(graph_with_stats.second); -} -} // namespace ir -} // namespace framework -} // namespace paddle - -REGISTER_PASS(fc_elementwise_add_mkldnn_fuse_pass, - paddle::framework::ir::FCResidualConnectionMKLDNNFusePass); -REGISTER_PASS_CAPABILITY(fc_elementwise_add_mkldnn_fuse_pass) - .AddCombination( - paddle::framework::compatible::OpVersionComparatorCombination() - .LE("fc", 0) - .LE("elementwise_add", 1)); diff --git a/paddle/fluid/framework/ir/mkldnn/fc_elementwise_add_mkldnn_fuse_pass.h b/paddle/fluid/framework/ir/mkldnn/fc_elementwise_add_mkldnn_fuse_pass.h deleted file mode 100644 index f92ce5bfc7044..0000000000000 --- a/paddle/fluid/framework/ir/mkldnn/fc_elementwise_add_mkldnn_fuse_pass.h +++ /dev/null @@ -1,48 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#pragma once - -#include "paddle/fluid/framework/ir/fuse_pass_base.h" -#include "paddle/fluid/framework/ir/graph_pattern_detector.h" - -namespace paddle { -namespace framework { -namespace ir { - -using GraphWithStats = std::pair; - -class FCResidualConnectionMKLDNNFusePass : public FusePassBase { - private: - GraphWithStats FuseFC(const std::string& name_scope, - const GraphWithStats& graph_with_stats, - bool fc_as_x) const; - - public: - FCResidualConnectionMKLDNNFusePass(); - virtual ~FCResidualConnectionMKLDNNFusePass() {} - - protected: - void ApplyImpl(ir::Graph* graph) const; - - static bool HasFusedActivation(Node* fc_node) { - return !( - fc_node->Op()->GetAttrIfExists("activation_type").empty()); - } - - const std::string name_scope_{"fc_elementwise_add_mkldnn_fuse"}; -}; -} // namespace ir -} // namespace framework -} // namespace paddle diff --git a/paddle/fluid/inference/api/paddle_pass_builder.cc b/paddle/fluid/inference/api/paddle_pass_builder.cc index b2a8fc86c2305..18cdf2c624545 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.cc +++ b/paddle/fluid/inference/api/paddle_pass_builder.cc @@ -372,7 +372,6 @@ void CpuPassStrategy::EnableMKLDNN() { // Disabled due to topology-dependent speed-up "fc_mkldnn_pass", "fc_act_mkldnn_fuse_pass", - "fc_elementwise_add_mkldnn_fuse_pass", // "self_attention_fuse_pass", // "batch_norm_act_fuse_pass", // "softplus_activation_onednn_fuse_pass", // @@ -407,7 +406,6 @@ void CpuPassStrategy::EnableMkldnnBfloat16() { if (!use_mkldnn_bfloat16_) { passes_.push_back("fc_mkldnn_pass"); passes_.push_back("fc_act_mkldnn_fuse_pass"); - passes_.push_back("fc_elementwise_add_mkldnn_fuse_pass"); passes_.push_back("cpu_bfloat16_placement_pass"); passes_.push_back("cpu_bfloat16_pass"); @@ -463,7 +461,6 @@ void CpuPassStrategy::EnableMkldnnInt8() { passes_.push_back("repeated_fc_relu_fuse_pass"); passes_.push_back("fc_mkldnn_pass"); passes_.push_back("fc_act_mkldnn_fuse_pass"); - passes_.push_back("fc_elementwise_add_mkldnn_fuse_pass"); passes_.push_back("matmul_transpose_reshape_mkldnn_fuse_pass"); passes_.push_back("batch_norm_act_fuse_pass"); passes_.push_back("softplus_activation_onednn_fuse_pass"); @@ -498,9 +495,7 @@ void CpuPassStrategy::DisableMkldnnFcPasses() { void CpuPassStrategy::EraseFcMkldnnPasses() { std::vector fc_passes_to_erase( - {"fc_mkldnn_pass", - "fc_act_mkldnn_fuse_pass", - "fc_elementwise_add_mkldnn_fuse_pass"}); + {"fc_mkldnn_pass", "fc_act_mkldnn_fuse_pass"}); for (const auto &pass : fc_passes_to_erase) { int idx = GetPassIndex(pass); if (idx != -1) { diff --git a/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc index 5cf06bcbbdcf0..18680fe678b5d 100644 --- a/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc @@ -46,9 +46,6 @@ GetDNNLScales(const ExecutionContext& ctx) { auto scale_in_data = ctx.Attr("Scale_in"); auto scale_out = ctx.Attr("Scale_out"); auto scale_weights_data = ctx.Attr>("Scale_weights"); - auto scale_in_eltwise_data = ctx.HasAttr("Scale_in_eltwise") - ? 
ctx.Attr("Scale_in_eltwise") - : 1.0f; std::vector dnnl_src_scales = {1.f / scale_in_data}; size_t count = scale_weights_data.size(); @@ -57,7 +54,7 @@ GetDNNLScales(const ExecutionContext& ctx) { for (size_t i = 0; i < count; i++) { dnnl_wei_scales[i] = 1.f / scale_weights_data[i]; } - std::vector dnnl_psum_scales = {1.f / scale_in_eltwise_data}; + std::vector dnnl_psum_scales = {1.f}; std::vector dnnl_dst_scales = {1.f / scale_out}; return std::make_tuple( @@ -127,7 +124,6 @@ class FCMKLDNNHandler dnnl::primitive_attr attributes; dnnl::post_ops post_operations; - float sum_scale = 1.0f; float activation_scale = 1.0f; if (phi::funcs::is_int8()) { std::vector src_scales, wei_scales, psum_scales, dst_scales; @@ -168,13 +164,6 @@ class FCMKLDNNHandler dst_scales.data(), dst_scales.size() * sizeof(float)); } - - sum_scale = psum_scales[0]; - } - - if (ctx.HasAttr("fuse_residual_connection") && - ctx.Attr("fuse_residual_connection")) { - post_operations.append_sum(sum_scale); } // ReLU from "fc_fuse_pass" @@ -332,22 +321,6 @@ class FCMKLDNNHandler std::shared_ptr AcquireCustomDstMemory( const ExecutionContext& ctx, phi::DenseTensor* out) { - if (ctx.HasAttr("fuse_residual_connection") && - ctx.Attr("fuse_residual_connection")) { - auto* residual_param = ctx.Input("ResidualData"); - - PADDLE_ENFORCE_EQ( - out->dims(), - residual_param->dims(), - phi::errors::InvalidArgument( - "Output and elementwise parameter need to have the " - "same dimension sizes, but got output's dimension = %d" - " and residual param's dimension =%d .", - out->dims().size(), - residual_param->dims().size())); - - out->ShareDataWith(*residual_param); - } return this->template AcquireDstMemory(out); } // namespace operators @@ -458,11 +431,7 @@ class FCMKLDNNKernel : public framework::OpKernel { dst_memory_p = std::make_shared(inner_product_cache->dst_mem); - if (ctx.HasAttr("fuse_residual_connection") && - ctx.Attr("fuse_residual_connection")) { - auto* residual_param = ctx.Input("ResidualData"); - out->ShareDataWith(*residual_param); - } + auto out_ptr = out->mutable_data( ctx.GetPlace(), dst_memory_p->get_desc().get_size()); dst_memory_p->set_data_handle(out_ptr); diff --git a/test/cpp/inference/api/analysis_predictor_tester.cc b/test/cpp/inference/api/analysis_predictor_tester.cc index 6e3497d14a0dd..0827c3c196c19 100644 --- a/test/cpp/inference/api/analysis_predictor_tester.cc +++ b/test/cpp/inference/api/analysis_predictor_tester.cc @@ -370,9 +370,7 @@ TEST(AnalysisPredictor, mkldnn_fc_passes_cpu_pass_strategy) { CpuPassStrategy cpuPassStrategy; cpuPassStrategy.EnableMKLDNN(); const std::vector fc_passes_to_erase( - {"fc_mkldnn_pass", - "fc_act_mkldnn_fuse_pass", - "fc_elementwise_add_mkldnn_fuse_pass"}); + {"fc_mkldnn_pass", "fc_act_mkldnn_fuse_pass"}); for (const auto& pass : fc_passes_to_erase) { ASSERT_NE(cpuPassStrategy.GetPassIndex(pass), (size_t)-1); } diff --git a/test/cpp/inference/api/analyzer_ernie_int8_tester.cc b/test/cpp/inference/api/analyzer_ernie_int8_tester.cc index a93bbb1ab7e24..f8e7eb9ee8ad5 100644 --- a/test/cpp/inference/api/analyzer_ernie_int8_tester.cc +++ b/test/cpp/inference/api/analyzer_ernie_int8_tester.cc @@ -34,7 +34,7 @@ void SetInt8Config(AnalysisConfig *cfg, pass_builder->DeletePass("constant_folding_pass"); auto warmup_data = std::make_shared>(data); cfg->mkldnn_quantizer_config()->SetEnabledOpTypes( - {"elementwise_add", "matmul", "matmul_v2", "fused_matmul"}); + {"matmul", "matmul_v2", "fused_matmul"}); // Exclusion of several matmules that should not be quantized due to the fact 
// that they reduce the accuracy of the model cfg->mkldnn_quantizer_config()->SetExcludedOpIds( diff --git a/test/cpp/inference/api/analyzer_vit_ocr_tester.cc b/test/cpp/inference/api/analyzer_vit_ocr_tester.cc index 3582fc22c9cc2..43457a05b4b48 100644 --- a/test/cpp/inference/api/analyzer_vit_ocr_tester.cc +++ b/test/cpp/inference/api/analyzer_vit_ocr_tester.cc @@ -104,7 +104,6 @@ TEST(Analyzer_vit_ocr, fuse_status) { CHECK_EQ(fuse_statis.at("fc_mkldnn_pass"), 33); CHECK_EQ(fuse_statis.at("fused_conv2d_gelu_mkldnn_fuse_pass"), 2); - CHECK_EQ(fuse_statis.at("fc_elementwise_add_mkldnn_fuse"), 16); } #endif diff --git a/test/ir/inference/CMakeLists.txt b/test/ir/inference/CMakeLists.txt index 1932ad37490ff..1e7fcfa6a1005 100755 --- a/test/ir/inference/CMakeLists.txt +++ b/test/ir/inference/CMakeLists.txt @@ -301,8 +301,6 @@ if(WITH_GPU AND TENSORRT_FOUND) set_tests_properties(test_mkldnn_conv_mish_fuse_pass PROPERTIES TIMEOUT 300) set_tests_properties(test_onednn_fc_activation_fuse_pass PROPERTIES TIMEOUT 300) - set_tests_properties(test_onednn_fc_elementwise_add_fuse_pass - PROPERTIES TIMEOUT 120) set_tests_properties(test_mkldnn_conv_affine_channel_fuse_pass PROPERTIES TIMEOUT 60) endif() diff --git a/test/ir/inference/test_onednn_fc_elementwise_add_fuse_pass.py b/test/ir/inference/test_onednn_fc_elementwise_add_fuse_pass.py deleted file mode 100644 index 9c27452aff5c4..0000000000000 --- a/test/ir/inference/test_onednn_fc_elementwise_add_fuse_pass.py +++ /dev/null @@ -1,104 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import unittest -from functools import partial - -import hypothesis.strategies as st -import numpy as np -from auto_scan_test import PassAutoScanTest -from program_config import OpConfig, ProgramConfig, TensorConfig - - -class TestFCElementwiseAddOneDNNFusePass(PassAutoScanTest): - def sample_program_config(self, draw): - axis = draw(st.sampled_from([-1, 0, 1])) - fc_as_x = draw(st.booleans()) - fc_in = draw(st.sampled_from([32, 64])) - fc_wei = draw(st.sampled_from([32, 64])) - - def generate_data(shape): - return np.random.random(shape).astype(np.float32) - - relu_op = OpConfig( - type='relu', - inputs={'X': ['input_data']}, - outputs={'Out': ['relu_out']}, - attrs={}, - ) - - fc_op = OpConfig( - type='fc', - inputs={ - 'Input': ['relu_out'], - 'W': ['fc_weight'], - 'Bias': ['fc_bias'], - }, - outputs={'Out': ['fc_output']}, - attrs={ - 'use_mkldnn': True, - 'padding_weights': False, - 'activation_type': '', - 'in_num_col_dims': 1, - }, - ) - - if fc_as_x: - inputs = {'X': ['fc_output'], 'Y': ['input_data']} - else: - inputs = {'X': ['input_data'], 'Y': ['fc_output']} - - elt_add_op = OpConfig( - type='elementwise_add', - inputs=inputs, - outputs={'Out': ['elementwise_output']}, - attrs={'axis': axis, 'use_mkldnn': True}, - ) - - model_net = [relu_op, fc_op, elt_add_op] - - program_config = ProgramConfig( - ops=model_net, - weights={ - 'fc_weight': TensorConfig( - data_gen=partial(generate_data, [fc_wei, fc_wei]) - ), - 'fc_bias': TensorConfig( - data_gen=partial(generate_data, [fc_wei]) - ), - }, - inputs={ - 'input_data': TensorConfig( - data_gen=partial(generate_data, [fc_in, fc_wei]) - ) - }, - outputs=['elementwise_output'], - ) - - return program_config - - def sample_predictor_configs(self, program_config): - config = self.create_inference_config( - use_mkldnn=True, passes=['fc_elementwise_add_mkldnn_fuse_pass'] - ) - yield config, ['relu', 'fc'], (1e-5, 1e-5) - - def test(self): - self.run_and_statis( - quant=False, passes=['fc_elementwise_add_mkldnn_fuse_pass'] - ) - - -if __name__ == '__main__': - unittest.main() From 81bd57c75dd8824146536557c97dadd891c1e927 Mon Sep 17 00:00:00 2001 From: Fisher Date: Mon, 24 Jul 2023 14:36:18 +0800 Subject: [PATCH 10/34] [CINN] Remove threshold in op mapper relu6 (#55611) * Just set threshold to 6 in op mapper relu6 * Remove attrs in op mapper relu6 --- paddle/cinn/frontend/paddle_model_to_program.cc | 6 ------ 1 file changed, 6 deletions(-) diff --git a/paddle/cinn/frontend/paddle_model_to_program.cc b/paddle/cinn/frontend/paddle_model_to_program.cc index 316712ff40e61..8a9ca73af2654 100644 --- a/paddle/cinn/frontend/paddle_model_to_program.cc +++ b/paddle/cinn/frontend/paddle_model_to_program.cc @@ -398,12 +398,6 @@ void PaddleModelToProgram::AddOpMapper_relu6() { CHECK_EQ(op_desc.Output("Out").size(), 1UL); auto out_name = op_desc.Output("Out").front(); - absl::flat_hash_map attrs; - CHECK(op_desc.HasAttr("threshold")); - CHECK_EQ(op_desc.GetAttr("threshold"), 6.0f) - << "Threshold of Relu6 is not 6! To be implemented."; - attrs["threshold"] = op_desc.GetAttr("threshold"); - auto x = GetVar(TransValidVarName(x_name)); auto out = net_builder_->Relu6(x); AddVar(TransValidVarName(out_name), out); From b10b899cd686ad994f85f8ecb14bc40e9c95dce4 Mon Sep 17 00:00:00 2001 From: houj04 <35131887+houj04@users.noreply.github.com> Date: Mon, 24 Jul 2023 14:44:42 +0800 Subject: [PATCH 11/34] [PHI] add fused_softmax_mask and fused_softmax_mask_grad for CPU. 
(#55616) --- .../cpu/fused_softmax_mask_grad_kernel.cc | 39 +++++++++ .../fusion/cpu/fused_softmax_mask_kernel.cc | 84 +++++++++++++++++++ test/legacy_test/test_softmax_mask_fuse_op.py | 10 +-- 3 files changed, 125 insertions(+), 8 deletions(-) create mode 100644 paddle/phi/kernels/fusion/cpu/fused_softmax_mask_grad_kernel.cc create mode 100644 paddle/phi/kernels/fusion/cpu/fused_softmax_mask_kernel.cc diff --git a/paddle/phi/kernels/fusion/cpu/fused_softmax_mask_grad_kernel.cc b/paddle/phi/kernels/fusion/cpu/fused_softmax_mask_grad_kernel.cc new file mode 100644 index 0000000000000..eb94d71b956a0 --- /dev/null +++ b/paddle/phi/kernels/fusion/cpu/fused_softmax_mask_grad_kernel.cc @@ -0,0 +1,39 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/softmax_grad_kernel.h" + +namespace phi { +namespace fusion { + +template +void FusedSoftmaxMaskGradKernel(const Context& dev_ctx, + const DenseTensor& out, + const DenseTensor& out_grad, + DenseTensor* x_grad) { + dev_ctx.template Alloc(x_grad); + SoftmaxGradKernel( + dev_ctx, out, out_grad, 3, x_grad); // axis for softmax +} + +} // namespace fusion +} // namespace phi + +PD_REGISTER_KERNEL(fused_softmax_mask_grad, + CPU, + ALL_LAYOUT, + phi::fusion::FusedSoftmaxMaskGradKernel, + float, + double) {} diff --git a/paddle/phi/kernels/fusion/cpu/fused_softmax_mask_kernel.cc b/paddle/phi/kernels/fusion/cpu/fused_softmax_mask_kernel.cc new file mode 100644 index 0000000000000..5ed56188ceea6 --- /dev/null +++ b/paddle/phi/kernels/fusion/cpu/fused_softmax_mask_kernel.cc @@ -0,0 +1,84 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/elementwise_add_kernel.h" +#include "paddle/phi/kernels/softmax_kernel.h" + +namespace phi { +namespace fusion { + +template +void FusedSoftmaxMaskKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& mask, + DenseTensor* out) { + auto x_dim = x.dims(); + auto mask_dim = mask.dims(); + auto query_seq_len = x_dim[2]; + auto key_seq_len = x_dim[3]; + + PADDLE_ENFORCE_GT(query_seq_len, + 1, + phi::errors::InvalidArgument( + "Input x's second last dim must be large than 1 but " + "received the second last dimension of x is %d", + query_seq_len)); + + PADDLE_ENFORCE_EQ(key_seq_len >= 32 && key_seq_len < 8192, + true, + phi::errors::InvalidArgument( + "Input x's last dim must be between [32, 8192) " + "received the last dimension of x is %d", + key_seq_len)); + + PADDLE_ENFORCE_EQ(mask_dim[1], + 1, + phi::errors::InvalidArgument( + "Input mask's second dim must be 1 " + "received the second dimension of mask is %d", + mask_dim[1])); + + // dim of x and mask must be equal + for (size_t idx = 0; idx < 4; ++idx) { + if (idx == 1) continue; + PADDLE_ENFORCE_EQ( + x_dim[idx], + mask_dim[idx], + phi::errors::InvalidArgument( + "Input x's %dth dim should be equal with input mask's %dth dim " + "but " + "received the %dth dimension of x and mask are not equal " + "the %dth dim of x is %d, while the %dth dim of mask is %d.", + idx, + idx, + idx, + idx, + x_dim[idx], + idx, + mask_dim[idx])); + } + DenseTensor t = phi::Add(dev_ctx, x, mask); + SoftmaxKernel(dev_ctx, t, 3, out); // axis for softmax +} + +} // namespace fusion +} // namespace phi + +PD_REGISTER_KERNEL(fused_softmax_mask, + CPU, + ALL_LAYOUT, + phi::fusion::FusedSoftmaxMaskKernel, + float, + double) {} diff --git a/test/legacy_test/test_softmax_mask_fuse_op.py b/test/legacy_test/test_softmax_mask_fuse_op.py index 13bf6efe56889..79c6ad8c93525 100644 --- a/test/legacy_test/test_softmax_mask_fuse_op.py +++ b/test/legacy_test/test_softmax_mask_fuse_op.py @@ -51,16 +51,10 @@ def setUp(self): self.outputs = {'Out': rst} def test_check_output(self): - try: - self.check_output_with_place(core.CPUPlace()) - except NotImplementedError: - pass + self.check_output_with_place(core.CPUPlace()) def test_check_grad(self): - try: - self.check_grad_with_place(core.CPUPlace(), ["X"], "Out") - except NotImplementedError: - pass + self.check_grad_with_place(core.CPUPlace(), ["X"], "Out") @unittest.skipIf( From a9f877ff0a547755e44602196f093aebdbae2f37 Mon Sep 17 00:00:00 2001 From: Yuang Liu Date: Mon, 24 Jul 2023 15:35:47 +0800 Subject: [PATCH 12/34] [sharding stage 1 optim] Sharding comm overlap with backward (#55598) --- .../framework/distributed_strategy.proto | 2 + .../dygraph_sharding_optimizer.py | 50 +++- .../fleet/meta_parallel/pipeline_parallel.py | 12 +- .../fleet/meta_parallel/pp_utils/utils.py | 124 -------- .../fleet/utils/tensor_fusion_helper.py | 273 +++++++++++++++++- ...rid_parallel_sharding_model_with_fusion.py | 2 + test/legacy_test/test_fused_comm_buffer.py | 2 +- 7 files changed, 307 insertions(+), 158 deletions(-) diff --git a/paddle/fluid/framework/distributed_strategy.proto b/paddle/fluid/framework/distributed_strategy.proto index 74f377fd875de..ce489352d3bcf 100755 --- a/paddle/fluid/framework/distributed_strategy.proto +++ b/paddle/fluid/framework/distributed_strategy.proto @@ -68,6 +68,8 @@ message PpConfig { message DygraphShardingConfig { optional bool tensor_fusion = 1 [ default = false ]; + optional int32 accumulate_steps = 2 [ 
default = 1 ]; + optional bool comm_overlap = 3 [ default = false ]; } message HybridConfig { diff --git a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/dygraph_sharding_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/dygraph_sharding_optimizer.py index 63d261e2e3dfe..ccb5bfdcd030a 100755 --- a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/dygraph_sharding_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/dygraph_sharding_optimizer.py @@ -78,12 +78,23 @@ def __init__(self, optimizer, hcg): self.tensor_fusion = strategy.hybrid_configs[ 'sharding_configs' ].tensor_fusion + self.accumulate_steps = strategy.hybrid_configs[ + 'sharding_configs' + ].accumulate_steps + self.comm_overlap = strategy.hybrid_configs[ + 'sharding_configs' + ].comm_overlap pp_overlap = strategy.hybrid_configs['pp_configs'].sharding_comm_overlap - if self.tensor_fusion: + if self.tensor_fusion or self.comm_overlap: assert ( not pp_overlap ), "Can not enable pp's sharding_comm_overlap and sharding's tensor_fusion at the same time." + self._use_main_grad = hasattr(self._parameter_list[0], "main_grad") + self._rank2decay = {} + self._rank2fused = {} + self._comm_buffers = [] + self._rank2params = self._partition_parameters() self._param2rank = self._map_param_to_rank() @@ -95,25 +106,22 @@ def __init__(self, optimizer, hcg): '_param_groups', self._rank2params[self._sharding_rank] ) else: - self._use_main_grad = hasattr(self._parameter_list[0], "main_grad") - self._rank2decay = {} - self._rank2fused = {} self._tensor_fusion() decay_params = [ p.name for p in self._rank2decay[self._sharding_rank] ] - all_params = self._rank2fused[self._sharding_rank] + fused_params = self._rank2fused[self._sharding_rank] apply_decay_param_fun = lambda x: x in decay_params - params = [] + all_fused_params = [] for v in self._rank2fused.values(): - params += v - self._parameter_list = params - self._param_groups = params + all_fused_params += v + self._parameter_list = all_fused_params + self._param_groups = all_fused_params - self._set_inner_opt_attr('_parameter_list', all_params) - self._set_inner_opt_attr('_param_groups', all_params) + self._set_inner_opt_attr('_parameter_list', fused_params) + self._set_inner_opt_attr('_param_groups', fused_params) origin_decay_param_fun = getattr( self._inner_opt, '_apply_decay_param_fun', None ) @@ -145,11 +153,23 @@ def clear_grad(self, set_to_zero=True): p.clear_gradient(set_to_zero) def _tensor_fusion(self): + comm_group = self._hcg.get_sharding_parallel_group() for i in range(self._sharding_world_size): params = self._rank2params[i] - decay_fused, all_fused = fused_parameters( - params, self._use_main_grad + dst = comm_group.ranks[i] + # TODO(sharding dev): make scale_after_comm a field to be configured by user + decay_fused, all_fused, all_buffer = fused_parameters( + params, + use_main_grad=self._use_main_grad, + fuse_param=True, + comm_overlap=self.comm_overlap, + comm_group=comm_group, + dst=dst, + acc_step=self.accumulate_steps, + scale_after_comm=False, ) + if self.comm_overlap: + self._comm_buffers += all_buffer self._rank2decay[i] = decay_fused self._rank2fused[i] = all_fused for p in all_fused: @@ -199,6 +219,10 @@ def _map_param_to_rank(self): def reduce_gradients(self, parameter_list, hcg): # TODO merge grad / nrank with dp logger.debug("sharding start gradients sync") + if self.comm_overlap: + for buffer in self._comm_buffers: + buffer.scale_grads() + return with 
framework.no_grad(): sharding_nrank = hcg.get_sharding_parallel_group().nranks for param in parameter_list: diff --git a/python/paddle/distributed/fleet/meta_parallel/pipeline_parallel.py b/python/paddle/distributed/fleet/meta_parallel/pipeline_parallel.py index 6644e2a06e5fe..2038a4c4e4606 100755 --- a/python/paddle/distributed/fleet/meta_parallel/pipeline_parallel.py +++ b/python/paddle/distributed/fleet/meta_parallel/pipeline_parallel.py @@ -37,11 +37,11 @@ from .pp_utils import p2p_communication as p2p from paddle.distributed.fleet.utils.tensor_fusion_helper import ( + HOOK_ACTION, + FusedCommBuffer, assign_group_by_size, ) -from .pp_utils.utils import HOOK_ACTION, FusedCommBuffer - __all__ = [] g_shard_use_reduce = int(os.environ.get("FLAGS_shard_use_reduce", 0)) @@ -334,9 +334,11 @@ def register_allreduce_overlap_hook(self, model, comm_group, acc_steps, dp): for dst in fused_parameter_group: parameter_list = fused_parameter_group[dst] - if not dp: + if act != HOOK_ACTION.ALL_REDUCE: # parse the relative dst rank to absolute dst rank for sharding dst = comm_group.ranks[dst] + else: + dst = -1 var_groups = assign_group_by_size(parameter_list) for group_idx, parameters in var_groups.items(): buffer = FusedCommBuffer( @@ -515,7 +517,7 @@ def forward_backward_pipeline( if self._comm_overlap: assert len(self._comm_buffers) > 0 for buffer in self._comm_buffers: - buffer.scale_and_split_grads() + buffer.scale_grads() if self._enable_timer: self.timers("allreduce_shared_weight_gradients").start() @@ -1256,7 +1258,7 @@ def forward_backward_pipeline( if self._comm_overlap: assert len(self._comm_buffers) > 0 for buffer in self._comm_buffers: - buffer.scale_and_split_grads() + buffer.scale_grads() if static_scheduler: self._reset_counter() diff --git a/python/paddle/distributed/fleet/meta_parallel/pp_utils/utils.py b/python/paddle/distributed/fleet/meta_parallel/pp_utils/utils.py index 6c8e2fd9dc3aa..33b8c3d95d582 100644 --- a/python/paddle/distributed/fleet/meta_parallel/pp_utils/utils.py +++ b/python/paddle/distributed/fleet/meta_parallel/pp_utils/utils.py @@ -15,19 +15,10 @@ import paddle from paddle import _legacy_C_ops -from paddle.distributed.fleet.utils.tensor_fusion_helper import ( - flatten_dense_tensors, -) -from paddle.framework import base as imperative_base __all__ = [] -class HOOK_ACTION: - ALL_REDUCE = 0 - REDUCE = 1 - - FLOAT_TYPE_DICT = { paddle.float16: "float16", paddle.float32: "float32", @@ -116,118 +107,3 @@ def _all_gather(tensor, group=None, use_calc_stream=True): 'nranks', nranks, ) - - -class FusedCommBuffer: - def __init__(self, id, params, comm_group, acc_steps=1, act=None, dst=-1): - self._id = id - self._params = params - self._acc_steps = acc_steps - self._comm_group = comm_group - - self.use_main_grad = hasattr(self._params[0], "main_grad") - - self._task = None - self._params_step_dict = {} - self._params_checked_in = 0 - self._params_to_addr = {} - - self._act = act - if self._act == HOOK_ACTION.ALL_REDUCE: - assert dst == -1 - elif self._act == HOOK_ACTION.REDUCE: - assert dst != -1 - else: - raise ValueError( - "The act should be allreudce for dp or reduce for sharding." 
- ) - self._dst = dst - - self._init_step_dict() - - self.grad_storage = flatten_dense_tensors( - self._params, - use_main_grad=self.use_main_grad, - fuse_param=False, - warp_buffer=False, - ).buffer - - self._record_addr() - - def _record_addr(self): - for param in self._params: - addr = ( - param.main_grad.data_ptr() - if self.use_main_grad - else param.grad.data_ptr() - ) - self._params_to_addr[param.name] = addr - - def _init_step_dict(self): - for p in self._params: - self._params_step_dict[p.name] = 0 - - def _reset_params_checked_in(self): - self._task = None - self._init_step_dict() - self._params_checked_in = 0 - - @property - def _all_params_checked_in(self): - return ( - len(self._params) == self._params_checked_in - and len(self._params_step_dict) == 0 - ) - - def add_grad(self, param): - assert param.name in self._params_step_dict - current_ptr = ( - param.main_grad.data_ptr() - if self.use_main_grad - else param.grad.data_ptr() - ) - if self._params_to_addr[param.name] != current_ptr: - raise ValueError( - "The address of the grad/main_grad of the param has been changed during training, " - "which is not allowed for dp/sharding overlap with pp. " - "This may be caused by some non-inplace operations on the grad/main_grad. " - "Please use the inplace version of the operations or disable the overlapping." - ) - - self._params_step_dict[param.name] += 1 - - if self._params_step_dict[param.name] == self._acc_steps: - self._params_checked_in += 1 - self._params_step_dict.pop(param.name) - - if self._all_params_checked_in: - self._comm_grads() - - @imperative_base.no_grad - def _comm_grads(self): - assert self._all_params_checked_in - - if self._act == HOOK_ACTION.ALL_REDUCE: - task = paddle.distributed.all_reduce( - self.grad_storage, group=self._comm_group, sync_op=False - ) - - elif self._act == HOOK_ACTION.REDUCE: - task = paddle.distributed.reduce( - self.grad_storage, - dst=self._dst, - group=self._comm_group, - sync_op=False, - ) - - self._task = task - - @imperative_base.no_grad - def scale_and_split_grads(self): - assert self._task is not None - self._task.wait() - - scale_factor = 1.0 / self._comm_group.nranks - self.grad_storage.scale_(scale_factor) - - self._reset_params_checked_in() diff --git a/python/paddle/distributed/fleet/utils/tensor_fusion_helper.py b/python/paddle/distributed/fleet/utils/tensor_fusion_helper.py index 403f9d5d9a6c1..f2720b04ea093 100644 --- a/python/paddle/distributed/fleet/utils/tensor_fusion_helper.py +++ b/python/paddle/distributed/fleet/utils/tensor_fusion_helper.py @@ -12,13 +12,21 @@ # See the License for the specific language governing permissions and # limitations under the License. 
import itertools +import os from collections import OrderedDict import numpy as np import paddle +from paddle.framework import base as imperative_base from paddle.framework import core + +class HOOK_ACTION: + ALL_REDUCE = 0 + REDUCE = 1 + + alignment = { "gpu": 256, } @@ -101,23 +109,204 @@ def flatten_dense_tensors( return grad_storage -def obtain_storage(parameters, use_main_grad, clip, dist): +def bw_hook_func(buffer, param): + @paddle.autograd.no_grad() + def fused_comm(*_): + buffer.add_grad(param) + + return fused_comm + + +class FusedCommBuffer: + def __init__( + self, + id, + params, + comm_group, + acc_steps=1, + act=None, + dst=-1, + use_main_grad=None, + fuse_param=False, + scale_after_comm=True, + ): + self._id = id + self._params = params + self._acc_steps = acc_steps + self._comm_group = comm_group + self._scale_after_comm = scale_after_comm + self._fuse_param = fuse_param + + self.use_main_grad = ( + use_main_grad + if use_main_grad is not None + else hasattr(self._params[0], "main_grad") + ) + + self._task = None + self._params_step_dict = {} + self._params_checked_in = 0 + self._grads_to_addr = {} + + self._act = act + if self._act == HOOK_ACTION.ALL_REDUCE: + assert dst == -1 + elif self._act == HOOK_ACTION.REDUCE: + assert dst != -1 + else: + raise ValueError( + "The act should be allreduce for dp or reduce for sharding." + ) + self._dst = dst + + self._init_step_dict() + + if self._fuse_param: + self.param_storage, self.grad_storage = flatten_dense_tensors( + self._params, + use_main_grad=use_main_grad, + fuse_param=True, + warp_buffer=True, + ) + self.param_storage = self.param_storage.buffer + self.grad_storage = self.grad_storage.buffer + else: + self.param_storage = None + self.grad_storage = flatten_dense_tensors( + self._params, + use_main_grad=self.use_main_grad, + fuse_param=False, + warp_buffer=False, + ).buffer + + self._record_addr() + + def _record_addr(self): + for param in self._params: + addr = ( + param.main_grad.data_ptr() + if self.use_main_grad + else param.grad.data_ptr() + ) + self._grads_to_addr[param.name] = addr + + def _init_step_dict(self): + for p in self._params: + self._params_step_dict[p.name] = 0 + + def _reset_params_checked_in(self): + self._task = None + self._init_step_dict() + self._params_checked_in = 0 + + @property + def _all_params_checked_in(self): + return ( + len(self._params) == self._params_checked_in + and len(self._params_step_dict) == 0 + ) + + def add_grad(self, param): + assert param.name in self._params_step_dict + current_ptr = ( + param.main_grad.data_ptr() + if self.use_main_grad + else param.grad.data_ptr() + ) + if self._grads_to_addr[param.name] != current_ptr: + raise ValueError( + "The address of the grad/main_grad of the param has been changed during training, " + "which is not allowed for dp/sharding overlap with pp. " + "This may be caused by some non-inplace operations on the grad/main_grad. " + "Please use the inplace version of the operations or disable the overlapping."
+ ) + + self._params_step_dict[param.name] += 1 + + if self._params_step_dict[param.name] == self._acc_steps: + self._params_checked_in += 1 + self._params_step_dict.pop(param.name) + + if self._all_params_checked_in: + self._comm_grads() + + @imperative_base.no_grad + def _comm_grads(self): + assert self._all_params_checked_in + + if not self._scale_after_comm: + scale_factor = 1.0 / self._comm_group.nranks + self.grad_storage.scale_(scale_factor) + + if self._act == HOOK_ACTION.ALL_REDUCE: + task = paddle.distributed.all_reduce( + self.grad_storage, group=self._comm_group, sync_op=False + ) + + elif self._act == HOOK_ACTION.REDUCE: + task = paddle.distributed.reduce( + self.grad_storage, + dst=self._dst, + group=self._comm_group, + sync_op=False, + ) + + self._task = task + + @imperative_base.no_grad + def scale_grads(self): + assert self._task is not None + self._task.wait() + + if self._scale_after_comm: + scale_factor = 1.0 / self._comm_group.nranks + self.grad_storage.scale_(scale_factor) + + self._reset_params_checked_in() + + +def obtain_storage( + parameters, + use_main_grad=False, + clip=True, + dist=False, + fuse_param=True, + comm_overlap=False, + act=None, + comm_group=None, + dst=-1, + acc_steps=1, + scale_after_comm=False, +): if len(parameters) < 1: - return [] + return [], [] var_groups = assign_group_by_size(parameters, group_size=256 * 1024 * 1024) storage = [] + buffers = [] for group_idx, parameters in var_groups.items(): - param_storage, grad_storage = flatten_dense_tensors( + comm_buffer = FusedCommBuffer( + group_idx, parameters, + comm_group=comm_group, + acc_steps=acc_steps, + act=act, + dst=dst, use_main_grad=use_main_grad, - fuse_param=True, - warp_buffer=True, + fuse_param=fuse_param, + scale_after_comm=scale_after_comm, ) - param_storage.buffer.need_clip = clip - param_storage.buffer.is_distributed = dist - storage.append(param_storage.buffer) - return storage + if fuse_param: + param_buffer = comm_buffer.param_storage + param_buffer.need_clip = clip + param_buffer.is_distributed = dist + storage.append(param_buffer) + if comm_overlap: + for param in parameters: + param._register_backward_hook(bw_hook_func(comm_buffer, param)) + buffers.append(comm_buffer) + + return storage, buffers def filter_params(params, is_fp32, is_distributed, need_clip): @@ -155,7 +344,38 @@ def filter_params(params, is_fp32, is_distributed, need_clip): return params, dtype -def fused_parameters(parameters, use_main_grad): +def fused_parameters( + parameters, + use_main_grad=False, + fuse_param=True, + comm_overlap=False, + comm_group=None, + dst=-1, + acc_step=1, + scale_after_comm=False, +): + """ + Fuse gradients. Fuse parameters if enabled. Prepare for comm overlap if enabled. + :param parameters: all parameters to be fused. + :param use_main_grad: whether the gradient uses main grad or not + :param comm_overlap: enable comm overlap or not + :param comm_group: the comm group for comm overlap + :param dst: the dst for comm overlap + :param acc_step: acc steps, used for comm overlap + :param fuse_param: fuse param or not + :param scale_after_comm: if comm overlap is enabled, specify whether grads are scaled before or after the communication + :return: param storage if fused, comm buffers if comm overlap is enabled + """ + g_shard_use_reduce = int(os.environ.get("FLAGS_shard_use_reduce", 0)) + act = ( + HOOK_ACTION.ALL_REDUCE if not g_shard_use_reduce else HOOK_ACTION.REDUCE + ) + if comm_overlap: + assert comm_group is not None + if act == HOOK_ACTION.REDUCE: + assert dst != -1 + elif act == HOOK_ACTION.ALL_REDUCE: + dst = -1 param_groups = [] attrs = [] @@ -178,6 +398,7 @@ def fused_parameters(parameters, use_main_grad): decay_fused = [] all_fused = [] + all_buffers = [] for params, attr in zip(param_groups, attrs): decay_params = [] other_params = [] @@ -190,14 +411,36 @@ def fused_parameters(parameters, use_main_grad): is_distributed = attr[1] need_clip = attr[2] - decay = obtain_storage( - decay_params, use_main_grad, need_clip, is_distributed + decay, decay_buffers = obtain_storage( + decay_params, + use_main_grad=use_main_grad, + clip=need_clip, + dist=is_distributed, + fuse_param=fuse_param, + comm_overlap=comm_overlap, + act=act, + comm_group=comm_group, + dst=dst, + acc_steps=acc_step, + scale_after_comm=scale_after_comm, ) - other = obtain_storage( - other_params, use_main_grad, need_clip, is_distributed + other, other_buffers = obtain_storage( + other_params, + fuse_param=fuse_param, + comm_overlap=comm_overlap, + use_main_grad=use_main_grad, + clip=need_clip, + dist=is_distributed, + act=act, + comm_group=comm_group, + dst=dst, + acc_steps=acc_step, + scale_after_comm=scale_after_comm, ) decay_fused += decay all_fused += decay all_fused += other + all_buffers += decay_buffers + all_buffers += other_buffers - return decay_fused, all_fused + return decay_fused, all_fused, all_buffers diff --git a/test/collective/fleet/hybrid_parallel_sharding_model_with_fusion.py b/test/collective/fleet/hybrid_parallel_sharding_model_with_fusion.py index 310313119b4c3..e70656a4ce608 100644 --- a/test/collective/fleet/hybrid_parallel_sharding_model_with_fusion.py +++ b/test/collective/fleet/hybrid_parallel_sharding_model_with_fusion.py @@ -99,6 +99,8 @@ def setUp(self): "pp_degree": 1, } self.strategy.hybrid_configs["sharding_configs"].tensor_fusion = True + self.strategy.hybrid_configs["sharding_configs"].comm_overlap = True + self.strategy.hybrid_configs["sharding_configs"].accumulate_steps = 1 fleet.init(is_collective=True, strategy=self.strategy) self.data = np.random.randint( 0, diff --git a/test/legacy_test/test_fused_comm_buffer.py b/test/legacy_test/test_fused_comm_buffer.py index ad771b6dfe5a2..25d9a2748bd0e 100644 --- a/test/legacy_test/test_fused_comm_buffer.py +++ b/test/legacy_test/test_fused_comm_buffer.py @@ -15,7 +15,7 @@ import unittest import paddle -from paddle.distributed.fleet.meta_parallel.pp_utils.utils import ( +from paddle.distributed.fleet.utils.tensor_fusion_helper import ( HOOK_ACTION, FusedCommBuffer, ) From f6161d1e180d76d099c59ab6d13663c2b86bbbc7 Mon Sep 17 00:00:00 2001 From: Yichen Zhang <32740647+pkuzyc@users.noreply.github.com> Date: Mon, 24 Jul 2023 16:56:24 +0800 Subject: [PATCH 13/34] [Semi-Auto] Add transpose spmd rule (#55350) * [Semi-Auto] Add transpose spmd rule * add unit test in cmake file * log perm info --- 
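Note (after the fold, not part of the commit message): the rule added below propagates sharding through transpose by labeling each tensor axis with an einsum-style letter and permuting the labels. A minimal Python sketch of that propagation, for intuition only; the function name infer_transpose_dims_mapping is illustrative and not a Paddle API, and the real rule additionally merges shardings across inputs via ShardingMergeForTensors:

def infer_transpose_dims_mapping(input_dims_mapping, perm):
    # Label each input axis with a letter, mirroring TransposeSPMDRule::InferForward.
    ndim = len(input_dims_mapping)
    input_axes = "abcdefghijklmnopqrstuvwxyz"[:ndim]
    # Normalize negative perm entries, as the C++ rule does (perm[i] < 0 means ndim + perm[i]).
    perm = [p + ndim if p < 0 else p for p in perm]
    # The output axes are the input labels permuted by perm.
    output_axes = "".join(input_axes[p] for p in perm)
    # Each output axis inherits the mesh dim of the input axis it came from.
    axis_to_dim = dict(zip(input_axes, input_dims_mapping))
    return [axis_to_dim[axis] for axis in output_axes]

# A [batch, head, seq, hidden] tensor sharded as [0, -1, 1, -1] and transposed
# with perm=[0, 2, 1, 3] keeps each shard attached to the axis it moves with:
assert infer_transpose_dims_mapping([0, -1, 1, -1], [0, 2, 1, 3]) == [0, 1, -1, -1]

Since transpose has a single input, the merge step is effectively a pass-through here; the dims mapping is only reordered, never reduced.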
.../auto_parallel/spmd_rules/common.h | 2 +- .../auto_parallel/spmd_rules/rules.h | 4 + .../spmd_rules/transpose_spmd_rule.cc | 103 ++++++++++++ .../spmd_rules/transpose_spmd_rule.h | 40 +++++ test/auto_parallel/spmd_rules/CMakeLists.txt | 1 + .../spmd_rules/test_transpose_rule.py | 154 ++++++++++++++++++ 6 files changed, 303 insertions(+), 1 deletion(-) create mode 100644 paddle/fluid/distributed/auto_parallel/spmd_rules/transpose_spmd_rule.cc create mode 100644 paddle/fluid/distributed/auto_parallel/spmd_rules/transpose_spmd_rule.h create mode 100644 test/auto_parallel/spmd_rules/test_transpose_rule.py diff --git a/paddle/fluid/distributed/auto_parallel/spmd_rules/common.h b/paddle/fluid/distributed/auto_parallel/spmd_rules/common.h index 35fb67938ee22..f5a49ab0a9f18 100644 --- a/paddle/fluid/distributed/auto_parallel/spmd_rules/common.h +++ b/paddle/fluid/distributed/auto_parallel/spmd_rules/common.h @@ -77,7 +77,7 @@ class SPMDRuleBase { PADDLE_ENFORCE_NE(iter, attrs.end(), paddle::platform::errors::NotFound( - "(%s) is not found in AttributeMap.")); + "(%s) is not found in AttributeMap.", name)); return iter->second; } }; diff --git a/paddle/fluid/distributed/auto_parallel/spmd_rules/rules.h b/paddle/fluid/distributed/auto_parallel/spmd_rules/rules.h index bba4339198021..713a52770926d 100644 --- a/paddle/fluid/distributed/auto_parallel/spmd_rules/rules.h +++ b/paddle/fluid/distributed/auto_parallel/spmd_rules/rules.h @@ -24,6 +24,7 @@ #include "paddle/fluid/distributed/auto_parallel/spmd_rules/replicated_spmd_rule.h" #include "paddle/fluid/distributed/auto_parallel/spmd_rules/softmax_spmd_rule.h" #include "paddle/fluid/distributed/auto_parallel/spmd_rules/split_spmd_rule.h" +#include "paddle/fluid/distributed/auto_parallel/spmd_rules/transpose_spmd_rule.h" // TODO(ljz) Automatic this process in cmake file. namespace paddle { @@ -155,6 +156,9 @@ REGISTER_SPMD_RULE(softmax_with_cross_entropy, CrossEntropyWithSoftmaxSPMDRule); REGISTER_SPMD_RULE(split, SplitSPMDRule); REGISTER_SPMD_RULE(split_with_num, SplitSPMDRule); +// transpose rule +REGISTER_SPMD_RULE(transpose, TransposeSPMDRule); + } // namespace auto_parallel } // namespace distributed } // namespace paddle diff --git a/paddle/fluid/distributed/auto_parallel/spmd_rules/transpose_spmd_rule.cc b/paddle/fluid/distributed/auto_parallel/spmd_rules/transpose_spmd_rule.cc new file mode 100644 index 0000000000000..fe567e70fa019 --- /dev/null +++ b/paddle/fluid/distributed/auto_parallel/spmd_rules/transpose_spmd_rule.cc @@ -0,0 +1,103 @@ +/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/
+
+#include "paddle/fluid/distributed/auto_parallel/spmd_rules/transpose_spmd_rule.h"
+#include "paddle/phi/core/distributed/auto_parallel/utils.h"
+
+namespace paddle {
+namespace distributed {
+namespace auto_parallel {
+using phi::distributed::auto_parallel::str_join;
+std::pair<std::vector<TensorDistAttr>, std::vector<TensorDistAttr>>
+TransposeSPMDRule::InferForward(const std::vector<DistTensorSpec>& input_specs,
+                                const paddle::framework::AttributeMap& attrs) {
+  // step0: Verify Input Args Based on Transpose Logic
+  int64_t ninputs = input_specs.size();
+  PADDLE_ENFORCE_EQ(
+      ninputs,
+      1,
+      phi::errors::InvalidArgument("The size of InputSpec in transpose must "
+                                   "be equal to 1, but got [%d].",
+                                   ninputs));
+  VerifySpecs(input_specs, "transpose");
+
+  // step1: Build Einsum Notation
+  std::vector<int64_t> perm_dims =
+      ExtractAttr<std::vector<int64_t>>("perm", attrs);
+  std::string alphabet = "abcdefghijklmnopqrstuvwxyz";
+
+  // get einsum notation for input
+  int64_t ndim = input_specs[0].shape().size();
+  std::vector<std::string> input_axes_vec;
+  std::string input_axes = alphabet.substr(0, ndim);
+  input_axes_vec.emplace_back(input_axes);
+
+  // get einsum notation for output
+  for (int64_t i = 0, n = perm_dims.size(); i < n; ++i) {
+    // convert the negative dim value to normal dim value
+    if (perm_dims[i] < 0) {
+      perm_dims[i] = ndim + perm_dims[i];
+    }
+  }
+  std::string output_axes = "";
+  for (int64_t i = 0; i < ndim; i++) {
+    output_axes.append(1, input_axes[perm_dims[i]]);
+  }
+
+  // step2: Sharding Propagation
+  // step2.1: merge input shardings
+  std::vector<std::pair<std::string, std::vector<int64_t>>> axes_sharding_info;
+  axes_sharding_info = GetAxesDimsMappingPair(input_axes_vec, input_specs);
+  std::unordered_map<std::string, int64_t> axis_to_dim_map =
+      ShardingMergeForTensors(axes_sharding_info);
+
+  // step2.2: infer output dims mapping from merged input dims mapping
+  std::vector<int64_t> output_dims_mapping =
+      GetDimsMappingForAxes(output_axes, axis_to_dim_map);
+
+  // initialize output dist_attr's process_mesh, batch_dim and dynamic dims with
+  // input dist_attr.
+  TensorDistAttr output_dist_attr =
+      CopyTensorDistAttrForOutput(input_specs[0].dist_attr());
+  output_dist_attr.set_dims_mapping(output_dims_mapping);
+
+  // Step2.3 handle input tensor partial (TODO)
+  VLOG(4) << "TransposeSPMDRule InferForward:";
+  for (int64_t i = 0; i < ninputs; i++) {
+    VLOG(4) << "Input" << std::to_string(i) << " shape: ["
+            << str_join(input_specs[i].shape()) << "] "
+            << "src_dims_mapping: [" << str_join(input_specs[i].dims_mapping())
+            << "] "
+            << "perm: [" << str_join(perm_dims) << "] "
+            << "dst_dims_mapping: [" << str_join(input_specs[i].dims_mapping())
+            << "]";
+  }
+  VLOG(4) << "Output dims_mapping: [" + str_join(output_dims_mapping) + "]\n\n";
+
+  return {{input_specs[0].dist_attr()}, {output_dist_attr}};
+}
+
+std::pair<std::vector<TensorDistAttr>, std::vector<TensorDistAttr>>
+TransposeSPMDRule::InferBackward(
+    const std::vector<DistTensorSpec>& output_specs,
+    const paddle::framework::AttributeMap& attrs) {
+  PADDLE_THROW(phi::errors::Unimplemented(
+      "InferBackward of TransposeSPMDRule is NOT implemented yet."));
+
+  return {};
+}
+
+}  // namespace auto_parallel
+}  // namespace distributed
+}  // namespace paddle
diff --git a/paddle/fluid/distributed/auto_parallel/spmd_rules/transpose_spmd_rule.h b/paddle/fluid/distributed/auto_parallel/spmd_rules/transpose_spmd_rule.h
new file mode 100644
index 0000000000000..b047932036a71
--- /dev/null
+++ b/paddle/fluid/distributed/auto_parallel/spmd_rules/transpose_spmd_rule.h
@@ -0,0 +1,40 @@
+/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
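The einsum notation in the forward rule above reduces to a pure permutation of the dims mapping: output axis i inherits the sharding of input axis perm[i], once negative entries in perm are normalized. A standalone Python sketch of exactly that step (an illustration, not Paddle code):

    def transpose_dims_mapping(dims_mapping, perm):
        # dims_mapping[d] is the mesh dim that shards tensor axis d (-1 means
        # replicated); perm follows transpose semantics, negatives wrap around.
        ndim = len(dims_mapping)
        perm = [p + ndim if p < 0 else p for p in perm]
        return [dims_mapping[p] for p in perm]

    # Both cases mirror expectations from the unit test below:
    assert transpose_dims_mapping([-1, 0, 1, -1], [0, 2, 3, 1]) == [-1, 1, -1, 0]
    assert transpose_dims_mapping([-1, -1, 0, 1], [-1, 0, -2, 1]) == [1, -1, 0, -1]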
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include
+#include
+#include
+#include
+
+#include "paddle/fluid/distributed/auto_parallel/spmd_rules/common.h"
+
+namespace paddle {
+namespace distributed {
+namespace auto_parallel {
+
+class TransposeSPMDRule : public SPMDRuleBase {
+ public:
+  std::pair<std::vector<TensorDistAttr>, std::vector<TensorDistAttr>>
+  InferForward(const std::vector<DistTensorSpec>& input_specs,
+               const paddle::framework::AttributeMap& attrs) override;
+
+  std::pair<std::vector<TensorDistAttr>, std::vector<TensorDistAttr>>
+  InferBackward(const std::vector<DistTensorSpec>& output_specs,
+                const paddle::framework::AttributeMap& attrs) override;
+};
+}  // namespace auto_parallel
+}  // namespace distributed
+}  // namespace paddle
diff --git a/test/auto_parallel/spmd_rules/CMakeLists.txt b/test/auto_parallel/spmd_rules/CMakeLists.txt
index 1da9d4674c381..43afd9aed75e7 100644
--- a/test/auto_parallel/spmd_rules/CMakeLists.txt
+++ b/test/auto_parallel/spmd_rules/CMakeLists.txt
@@ -9,6 +9,7 @@ if(WITH_DISTRIBUTE AND WITH_GPU)
   py_test_modules(test_matmul_rule MODULES test_replicated_rule)
   py_test_modules(test_matmul_rule MODULES test_softmax_rule)
   py_test_modules(test_split_rule MODULES test_split_rule)
+  py_test_modules(test_transpose_rule MODULES test_transpose_rule)
 
   # End of unittests WITH single card WITHOUT timeout
 endif()
diff --git a/test/auto_parallel/spmd_rules/test_transpose_rule.py b/test/auto_parallel/spmd_rules/test_transpose_rule.py
new file mode 100644
index 0000000000000..62c86c3cf3f38
--- /dev/null
+++ b/test/auto_parallel/spmd_rules/test_transpose_rule.py
@@ -0,0 +1,154 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+from paddle.distributed.auto_parallel.static.completion import get_spmd_rule
+from paddle.distributed.auto_parallel.static.dist_attribute import (
+    DistTensorSpec,
+    TensorDistAttr,
+)
+from paddle.distributed.fleet import auto
+
+
+class TestTransposeSPMDRule(unittest.TestCase):
+    """
+    Unit tests for transpose spmd rule.
+ """ + + def setUp(self): + self.rule = get_spmd_rule("transpose") + + x_shape = [64, 36] + process_mesh = auto.ProcessMesh(mesh=[0, 1, 2, 3]) + + x_tensor_dist_attr = TensorDistAttr() + x_tensor_dist_attr.dims_mapping = [1, 0] + x_tensor_dist_attr.process_mesh = process_mesh + self.x_dist_tensor_spec = DistTensorSpec(x_shape, x_tensor_dist_attr) + + self.attrs = { + 'perm': [0, 1, 2, 3], + } + + def test_single_mesh_dim(self): + # perm = [1, 0] + # [0, -1] --> [0, -1], [-1, 0] + self.attrs['perm'] = [1, 0] + self.x_dist_tensor_spec.set_dims_mapping([0, -1]) + result_dist_attrs = self.rule.infer_forward( + [self.x_dist_tensor_spec], self.attrs + ) + infered_input_dist_attrs = result_dist_attrs[0] + infered_output_dist_attrs = result_dist_attrs[1] + + self.assertEqual(len(result_dist_attrs), 2) + self.assertEqual(len(infered_input_dist_attrs), 1) + self.assertEqual(len(infered_output_dist_attrs), 1) + self.assertEqual(infered_input_dist_attrs[0].dims_mapping, [0, -1]) + self.assertEqual(infered_output_dist_attrs[0].dims_mapping, [-1, 0]) + + # perm = [0, 1] + # [0, -1] --> [0, -1], [0, -1] + self.attrs['perm'] = [0, 1] + self.x_dist_tensor_spec.set_dims_mapping([0, -1]) + result_dist_attrs = self.rule.infer_forward( + [self.x_dist_tensor_spec], self.attrs + ) + infered_input_dist_attrs = result_dist_attrs[0] + infered_output_dist_attrs = result_dist_attrs[1] + + self.assertEqual(infered_input_dist_attrs[0].dims_mapping, [0, -1]) + self.assertEqual(infered_output_dist_attrs[0].dims_mapping, [0, -1]) + + # perm = [0, 2, 3, 1] + # [-1, -1, 0, -1] --> [-1, -1, 0, -1], [-1, 0, -1, -1] + self.x_dist_tensor_spec.shape = [64, 48, 36, 24] + self.attrs['perm'] = [0, 2, 3, 1] + self.x_dist_tensor_spec.set_dims_mapping([-1, -1, 0, -1]) + result_dist_attrs = self.rule.infer_forward( + [self.x_dist_tensor_spec], self.attrs + ) + infered_input_dist_attrs = result_dist_attrs[0] + infered_output_dist_attrs = result_dist_attrs[1] + + self.assertEqual( + infered_input_dist_attrs[0].dims_mapping, [-1, -1, 0, -1] + ) + self.assertEqual( + infered_output_dist_attrs[0].dims_mapping, [-1, 0, -1, -1] + ) + + def test_multi_mesh_dim(self): + process_mesh = auto.ProcessMesh(mesh=[[0, 1, 2], [3, 4, 5]]) + self.x_dist_tensor_spec.set_process_mesh(process_mesh) + self.x_dist_tensor_spec.shape = [64, 48, 36, 24] + + # perm = [0, 2, 3, 1] + # [-1, 0, 1, -1] --> [-1, 0, 1, -1], [-1, 1, -1, 0] + self.attrs['perm'] = [0, 2, 3, 1] + self.x_dist_tensor_spec.set_dims_mapping([-1, 0, 1, -1]) + result_dist_attrs = self.rule.infer_forward( + [self.x_dist_tensor_spec], self.attrs + ) + infered_input_dist_attrs = result_dist_attrs[0] + infered_output_dist_attrs = result_dist_attrs[1] + + self.assertEqual(len(result_dist_attrs), 2) + self.assertEqual(len(infered_input_dist_attrs), 1) + self.assertEqual(len(infered_output_dist_attrs), 1) + self.assertEqual( + infered_input_dist_attrs[0].dims_mapping, [-1, 0, 1, -1] + ) + self.assertEqual( + infered_output_dist_attrs[0].dims_mapping, [-1, 1, -1, 0] + ) + + # perm = [0, 2, 3, 1] + # [-1, -1, -1, -1] --> [-1, -1, -1, -1], [-1, -1, -1, -1] + self.attrs['perm'] = [0, 2, 3, 1] + self.x_dist_tensor_spec.set_dims_mapping([-1, -1, -1, -1]) + result_dist_attrs = self.rule.infer_forward( + [self.x_dist_tensor_spec], self.attrs + ) + infered_input_dist_attrs = result_dist_attrs[0] + infered_output_dist_attrs = result_dist_attrs[1] + + self.assertEqual( + infered_input_dist_attrs[0].dims_mapping, [-1, -1, -1, -1] + ) + self.assertEqual( + infered_output_dist_attrs[0].dims_mapping, [-1, -1, -1, 
-1] + ) + + # perm = [-1, 0, -2, 1] + # [-1, -1, 0, 1] --> [-1, -1, 0, 1], [1, -1, 0, -1] + self.attrs['perm'] = [-1, 0, -2, 1] + self.x_dist_tensor_spec.set_dims_mapping([-1, -1, 0, 1]) + result_dist_attrs = self.rule.infer_forward( + [self.x_dist_tensor_spec], self.attrs + ) + infered_input_dist_attrs = result_dist_attrs[0] + infered_output_dist_attrs = result_dist_attrs[1] + + self.assertEqual( + infered_input_dist_attrs[0].dims_mapping, [-1, -1, 0, 1] + ) + self.assertEqual( + infered_output_dist_attrs[0].dims_mapping, [1, -1, 0, -1] + ) + + +if __name__ == "__main__": + unittest.main() From 0f0dfe9aee46a042c3af05c3cade3af881ff2ba9 Mon Sep 17 00:00:00 2001 From: Windfarer Date: Mon, 24 Jul 2023 17:54:05 +0800 Subject: [PATCH 14/34] [Bug Fix] convert environment variables' types (#55586) --- .../distributed/launch/context/__init__.py | 5 +- .../distributed/launch/context/args_envs.py | 48 +++++++++---------- 2 files changed, 27 insertions(+), 26 deletions(-) diff --git a/python/paddle/distributed/launch/context/__init__.py b/python/paddle/distributed/launch/context/__init__.py index b252e966021bc..9083347a51158 100644 --- a/python/paddle/distributed/launch/context/__init__.py +++ b/python/paddle/distributed/launch/context/__init__.py @@ -97,8 +97,9 @@ def continous_log(self) -> bool: def set_env_in_args(self): for k, v in env_args_mapping.items(): + attr, attr_type = v if k in self.envs: print( - f"LAUNCH WARNNING args {v} is override by env {self.envs[k]}" + f"LAUNCH WARNNING args {attr} will be overridden by env: {k} value: {self.envs[k]}" ) - setattr(self.args, v, self.envs[k]) + setattr(self.args, attr, attr_type(self.envs[k])) diff --git a/python/paddle/distributed/launch/context/args_envs.py b/python/paddle/distributed/launch/context/args_envs.py index 7dc410de3450d..56eac96f1b8b9 100644 --- a/python/paddle/distributed/launch/context/args_envs.py +++ b/python/paddle/distributed/launch/context/args_envs.py @@ -17,30 +17,30 @@ from distutils.util import strtobool env_args_mapping = { - 'POD_IP': 'host', - 'PADDLE_MASTER': 'master', - 'PADDLE_DEVICES': 'devices', - 'PADDLE_NNODES': 'nnodes', - 'PADDLE_RUN_MODE': 'run_mode', - 'PADDLE_LOG_LEVEL': 'log_level', - 'PADDLE_LOG_OVERWRITE': 'log_overwrite', - 'PADDLE_SORT_IP': 'sort_ip', - 'PADDLE_NPROC_PER_NODE': 'nproc_per_node', - 'PADDLE_JOB_ID': 'job_id', - 'PADDLE_RANK': 'rank', - 'PADDLE_LOG_DIR': 'log_dir', - 'PADDLE_MAX_RESTART': 'max_restart', - 'PADDLE_ELASTIC_LEVEL': 'elastic_level', - 'PADDLE_ELASTIC_TIMEOUT': 'elastic_timeout', - 'PADDLE_SERVER_NUM': 'server_num', - 'PADDLE_TRAINER_NUM': 'trainer_num', - 'PADDLE_SERVERS_ENDPOINTS': 'servers', - 'PADDLE_TRAINERS_ENDPOINTS': 'trainers', - 'PADDLE_GLOO_PORT': 'gloo_port', - 'PADDLE_WITH_GLOO': 'with_gloo', - 'PADDLE_START_PORT': 'start_port', - 'PADDLE_IPS': 'ips', - "PADDLE_AUTO_PARALLEL_CONFIG": 'auto_parallel_config', + 'POD_IP': ('host', str), + 'PADDLE_MASTER': ('master', str), + 'PADDLE_DEVICES': ('devices', str), + 'PADDLE_NNODES': ('nnodes', str), + 'PADDLE_RUN_MODE': ('run_mode', str), + 'PADDLE_LOG_LEVEL': ('log_level', str), + 'PADDLE_LOG_OVERWRITE': ('log_overwrite', strtobool), + 'PADDLE_SORT_IP': ('sort_ip', strtobool), + 'PADDLE_NPROC_PER_NODE': ('nproc_per_node', int), + 'PADDLE_JOB_ID': ('job_id', str), + 'PADDLE_RANK': ('rank', int), + 'PADDLE_LOG_DIR': ('log_dir', str), + 'PADDLE_MAX_RESTART': ('max_restart', int), + 'PADDLE_ELASTIC_LEVEL': ('elastic_level', int), + 'PADDLE_ELASTIC_TIMEOUT': ('elastic_timeout', int), + 'PADDLE_SERVER_NUM': ('server_num', int), 
+ 'PADDLE_TRAINER_NUM': ('trainer_num', int), + 'PADDLE_SERVERS_ENDPOINTS': ('servers', str), + 'PADDLE_TRAINERS_ENDPOINTS': ('trainers', str), + 'PADDLE_GLOO_PORT': ('gloo_port', int), + 'PADDLE_WITH_GLOO': ('with_gloo', str), + 'PADDLE_START_PORT': ('start_port', int), + 'PADDLE_IPS': ('ips', str), + "PADDLE_AUTO_PARALLEL_CONFIG": ('auto_parallel_config', str), } From 38fbbe6be5fa0a5e6beaebb214636f1c67540d96 Mon Sep 17 00:00:00 2001 From: jjyaoao Date: Mon, 24 Jul 2023 18:18:12 +0800 Subject: [PATCH 15/34] =?UTF-8?q?=E4=BF=AE=E6=94=B9COPY-FROM=20No.13=20dis?= =?UTF-8?q?tributed=20(#55236)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: jjyaoao --- .../distributed/fleet/base/role_maker.py | 41 +++++++++++++++++++ .../distributed/fleet/dataset/dataset.py | 8 ++-- python/paddle/distributed/fleet/fleet.py | 29 ++++++++----- 3 files changed, 65 insertions(+), 13 deletions(-) diff --git a/python/paddle/distributed/fleet/base/role_maker.py b/python/paddle/distributed/fleet/base/role_maker.py index 113a0132f4c12..0b09fd082ba41 100755 --- a/python/paddle/distributed/fleet/base/role_maker.py +++ b/python/paddle/distributed/fleet/base/role_maker.py @@ -546,6 +546,30 @@ def _barrier(self, comm_world): class PaddleCloudRoleMaker(RoleMakerBase): + + """ + PaddleCloudRoleMaker is an interface for distributed configuration initialization based on obtaining distributed related information from environment variables. + + Examples: + .. code-block:: python + + import os + import paddle.distributed.fleet as fleet + + os.environ["PADDLE_PSERVER_NUMS"] = "2" + os.environ["PADDLE_TRAINERS_NUM"] = "2" + + os.environ["POD_IP"] = "127.0.0.1" + os.environ["PADDLE_PORT"] = "36001" + os.environ["TRAINING_ROLE"] = "PSERVER" + os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:36001,127.0.0.2:36001" + + os.environ["PADDLE_TRAINER_ID"] = "0" + + fleet.PaddleCloudRoleMaker(is_collective=False) + + """ + def __init__(self, is_collective=False, **kwargs): super().__init__() self._is_collective = is_collective @@ -1184,6 +1208,23 @@ def _generate_role(self): class UserDefinedRoleMaker(PaddleCloudRoleMaker): + + """ + UserDefinedRoleMaker is an interface for distributed configuration initialization based on obtaining distributed related information from user-defined parameters. + + Examples: + .. code-block:: python + + import paddle.distributed.fleet as fleet + from paddle.distributed.fleet.base.role_maker import Role + + fleet.UserDefinedRoleMaker( + current_id=0, + role=Role.SERVER, + worker_num=2, + server_endpoints=["127.0.0.1:36011", "127.0.0.1:36012"]) + """ + def __init__(self, is_collective=False, init_gloo=False, **kwargs): super().__init__( is_collective=is_collective, init_gloo=init_gloo, **kwargs diff --git a/python/paddle/distributed/fleet/dataset/dataset.py b/python/paddle/distributed/fleet/dataset/dataset.py index a28d439f7ecd5..8cdd57a34248e 100755 --- a/python/paddle/distributed/fleet/dataset/dataset.py +++ b/python/paddle/distributed/fleet/dataset/dataset.py @@ -1279,10 +1279,11 @@ class QueueDataset(DatasetBase): QueueDataset, it will process data streamly. Examples: - .. code-block:: python - import paddle - dataset = paddle.distributed.QueueDataset() + .. 
code-block:: python + + import paddle + dataset = paddle.distributed.QueueDataset() """ @@ -1298,6 +1299,7 @@ def init(self, **kwargs): :api_attr: Static Graph should be called only once in user's python scripts to initialize setings of dataset instance + """ super().init(**kwargs) diff --git a/python/paddle/distributed/fleet/fleet.py b/python/paddle/distributed/fleet/fleet.py index 860e0a6ce5013..2dab355264b4d 100755 --- a/python/paddle/distributed/fleet/fleet.py +++ b/python/paddle/distributed/fleet/fleet.py @@ -105,10 +105,11 @@ class Fleet: Returns: Fleet: A Fleet instance - Example for collective training: .. code-block:: python + :name: code-example1 + # Example1: for collective training import paddle paddle.enable_static() import paddle.distributed.fleet as fleet @@ -122,10 +123,11 @@ class Fleet: # do distributed training - Example for parameter server training: .. code-block:: python + :name: code-example2 + # Example2: for parameter server training import paddle paddle.enable_static() import paddle.distributed.fleet as fleet @@ -195,40 +197,39 @@ def init( Returns: None - Examples1: + Examples: .. code-block:: python + :name: code-example1 import paddle.distributed.fleet as fleet fleet.init() - Examples2: + .. code-block:: python + :name: code-example2 import paddle.distributed.fleet as fleet fleet.init(is_collective=True) - Examples3: .. code-block:: python - + :name: code-example3 import paddle.distributed.fleet as fleet role = fleet.PaddleCloudRoleMaker() fleet.init(role) - Examples4: .. code-block:: python - + :name: code-example4 import paddle.distributed.fleet as fleet strategy = fleet.DistributedStrategy() fleet.init(strategy=strategy) - Examples5: .. code-block:: python - + :name: code-example5 import paddle.distributed.fleet as fleet strategy = fleet.DistributedStrategy() fleet.init(log_level = "DEBUG") @@ -627,6 +628,14 @@ def barrier_worker(self): Returns: None + + Examples: + + .. 
code-block:: python + + import paddle.distributed.fleet as fleet + fleet.init() + fleet.barrier_worker() """ self._role_maker._barrier("worker") From 76530a2a061137a365f036f1ac8537fc1eec558e Mon Sep 17 00:00:00 2001 From: JYChen Date: Mon, 24 Jul 2023 18:31:12 +0800 Subject: [PATCH 16/34] add IndexPutGradInfermeta to fix backward error in static-mode (#55602) * add IndexPutGradInfermeta to fix backward error in static-mode * codestyle --- paddle/phi/api/yaml/backward.yaml | 3 +-- paddle/phi/infermeta/backward.cc | 15 ++++++++++++ paddle/phi/infermeta/backward.h | 7 ++++++ test/legacy_test/test_index_put_op.py | 33 +++++++++++++++++++++++++++ 4 files changed, 56 insertions(+), 2 deletions(-) diff --git a/paddle/phi/api/yaml/backward.yaml b/paddle/phi/api/yaml/backward.yaml index 465df08392d91..924a5f59d5055 100644 --- a/paddle/phi/api/yaml/backward.yaml +++ b/paddle/phi/api/yaml/backward.yaml @@ -1092,8 +1092,7 @@ args : (Tensor x, Tensor[] indices, Tensor value, Tensor out_grad, bool accumulate=false) output : Tensor(x_grad), Tensor(value_grad) infer_meta : - func : GeneralBinaryGradInferMeta - param : [x, value] + func : IndexPutGradInferMeta kernel : func : index_put_grad data_type : out_grad diff --git a/paddle/phi/infermeta/backward.cc b/paddle/phi/infermeta/backward.cc index b028fd15b1b93..d1078e2d176bc 100644 --- a/paddle/phi/infermeta/backward.cc +++ b/paddle/phi/infermeta/backward.cc @@ -1202,6 +1202,21 @@ void IndexAddGradInferMeta(const MetaTensor& index, } } +void IndexPutGradInferMeta(const MetaTensor& x, + const std::vector& indices, + const MetaTensor& value, + const MetaTensor& out_grad, + bool accumulate, + MetaTensor* x_grad, + MetaTensor* value_grad) { + if (x_grad) { + x_grad->share_meta(x); + } + if (value_grad) { + value_grad->share_meta(value); + } +} + void FusedRopeGradInferMeta(const MetaTensor& dout_q, const MetaTensor& dout_k, const MetaTensor& dout_v, diff --git a/paddle/phi/infermeta/backward.h b/paddle/phi/infermeta/backward.h index cb923e16446af..c73e5ab7a4d9e 100644 --- a/paddle/phi/infermeta/backward.h +++ b/paddle/phi/infermeta/backward.h @@ -467,4 +467,11 @@ void IndexAddGradInferMeta(const MetaTensor& index, MetaTensor* x_grad, MetaTensor* add_tensor_grad); +void IndexPutGradInferMeta(const MetaTensor& x, + const std::vector& indices, + const MetaTensor& value, + const MetaTensor& out_grad, + bool accumulate, + MetaTensor* x_grad, + MetaTensor* value_grad); } // namespace phi diff --git a/test/legacy_test/test_index_put_op.py b/test/legacy_test/test_index_put_op.py index c4bf5d6f0fd40..f21f7b084bde4 100644 --- a/test/legacy_test/test_index_put_op.py +++ b/test/legacy_test/test_index_put_op.py @@ -854,6 +854,39 @@ def test_backwardBroadCastValue2(self): atol=1e-7, ) + def test_backward_in_static(self): + paddle.enable_static() + exe = paddle.static.Executor() + train_program = paddle.static.Program() + startup_program = paddle.static.Program() + with paddle.static.program_guard(train_program, startup_program): + x = paddle.zeros((4, 2, 5)) + x.stop_gradient = False + + y = x + 1 + index = paddle.to_tensor([0, 1, 3]) + + value = paddle.ones((5,)) + value.stop_gradient = False + + z = paddle.index_put(y, (index,), value) + l = z.sum() + paddle.static.append_backward(l) + res = exe.run(fetch_list=[z, x.grad_name, value.grad_name]) + + expected_z = np.ones((4, 2, 5)) + expected_z[[0, 1, 3]] = np.ones((5,)) + + expected_x_grad = np.ones((4, 2, 5)) + expected_x_grad[[0, 1, 3]] = 0 + + expected_v_grad = np.ones((5,)) * 3 * 2 + + 
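+        # Why 3 * 2: `value` broadcasts over y[(index,)], which has shape
+        # (3, 2, 5), so each of the 5 elements of `value` is written to
+        # 3 * 2 positions of z, and z.sum() sends gradient 1 to each one.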
np.testing.assert_allclose(expected_z, res[0]) + np.testing.assert_allclose(expected_x_grad, res[1]) + np.testing.assert_allclose(expected_v_grad, res[2]) + paddle.disable_static() + class TestIndexPutAPIMixedIndices(TestIndexPutAPIBase): def init_dtype_type(self): From a3cf25e302acd8b81a4bdb83e4606efef2208470 Mon Sep 17 00:00:00 2001 From: chen <103103266+ckl117@users.noreply.github.com> Date: Mon, 24 Jul 2023 19:39:56 +0800 Subject: [PATCH 17/34] [Paddle-TRT] Convert 0D tensor to 1D tensor, increase the shape tensor's number count when collecting shape (#55503) * make 0-D tensor to 1-D tensor to support Grounding-SAM and add shape check * recover identity_op_clean_pass.cc --- .../fluid/inference/api/analysis_predictor.cc | 4 +-- .../tensorrt/convert/elementwise_op.cc | 2 +- .../inference/tensorrt/convert/op_converter.h | 29 +++++++++++++++++-- paddle/fluid/inference/tensorrt/engine.cc | 5 ++++ paddle/fluid/inference/tensorrt/engine.h | 4 +++ .../operators/tensorrt/tensorrt_engine_op.h | 12 ++++++++ 6 files changed, 50 insertions(+), 6 deletions(-) diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 44bd09e86ada9..e1c8410b9db35 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -2236,10 +2236,10 @@ void AnalysisPredictor::HookCollectShapeRangeInfo() { // We need collect value range for shape tensor for Paddle-TRT's use. // To be noticed, this method to identify all shape tensors is based on - // assumption that all shape tensors in the model have numbers <= 7. + // assumption that all shape tensors in the model have numbers <= 8. // This is a simple method to identify all shape tensors with some // mistakes, but it doesn't matter. 
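  // Concretely, a tensor is treated as a shape-tensor candidate when it
  // holds int32/int64 data and has 1 <= numel() <= 8 (the bound raised here).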
- auto is_shape_tensor = tensor.numel() <= 7 && tensor.numel() >= 1; + auto is_shape_tensor = tensor.numel() <= 8 && tensor.numel() >= 1; if ((tensor.dtype() == phi::DataType::INT32 || tensor.dtype() == phi::DataType::INT64) && is_shape_tensor) { diff --git a/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc b/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc index 252b57551e526..81e175fac5ea2 100644 --- a/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc @@ -30,7 +30,7 @@ class ElementwiseTensorOpConverter : public OpConverter { auto* X = engine_->GetITensor(op_desc.Input("X").front()); nvinfer1::ITensor* Y = nullptr; auto* Y_v = scope.FindVar(op_desc.Input("Y").front()); - if (Y_v) { + if (Y_v && !engine_->with_dynamic_shape()) { // Y is weight auto* Y_t = Y_v->GetMutable(); std::vector dims_y = phi::vectorize(Y_t->dims()); diff --git a/paddle/fluid/inference/tensorrt/convert/op_converter.h b/paddle/fluid/inference/tensorrt/convert/op_converter.h index e2991a2c708d8..b07b20bb82121 100644 --- a/paddle/fluid/inference/tensorrt/convert/op_converter.h +++ b/paddle/fluid/inference/tensorrt/convert/op_converter.h @@ -308,9 +308,15 @@ class OpConverter { auto var_shape = var->GetShape(); if (engine->with_dynamic_shape()) { #if IS_TRT_VERSION_GE(6000) - auto min_input_shape = engine->min_input_shape()[input]; - auto max_input_shape = engine->max_input_shape()[input]; - auto optim_input_shape = engine->optim_input_shape()[input]; + if (!(engine->min_input_shape().count(input) && + engine->max_input_shape().count(input) && + engine->optim_input_shape().count(input))) { + PADDLE_THROW(platform::errors::InvalidArgument( + "Cannot get %s min/max/opt shape", input)); + } + auto min_input_shape = engine->min_input_shape().at(input); + auto max_input_shape = engine->max_input_shape().at(input); + auto optim_input_shape = engine->optim_input_shape().at(input); size_t ranks = min_input_shape.size(); std::vector input_shape; @@ -732,6 +738,23 @@ class OpConverter { layer_name += output_tensor_names[i]; if (i != num_out - 1) layer_name += ", "; } + for (size_t i = 0; i < num_out; i++) { + nvinfer1::Dims tmp_dims = layer->getOutput(i)->getDimensions(); + std::vector tmp_vec; + for (int i = 0; i < tmp_dims.nbDims; i++) + tmp_vec.push_back(tmp_dims.d[i]); + + VLOG(3) << output_tensor_names[i] << "'s dimension :[" + << string::join_strings(tmp_vec, ',') << "]"; + // The following check may cause errors in CI, but is necessary in the + // latest version. + // PADDLE_ENFORCE_GE( + // layer->getOutput(i)->getDimensions().nbDims, + // 0, + // platform::errors::InvalidArgument( + // "Error occures in Paddle-TRT layer with output name: %s", + // output_tensor_names[i].c_str())); + } layer->setName((layer_name + ")").c_str()); } void SetEngine(TensorRTEngine* engine) { engine_ = engine; } diff --git a/paddle/fluid/inference/tensorrt/engine.cc b/paddle/fluid/inference/tensorrt/engine.cc index 62b7957332508..6e9beb810f0bd 100644 --- a/paddle/fluid/inference/tensorrt/engine.cc +++ b/paddle/fluid/inference/tensorrt/engine.cc @@ -532,6 +532,11 @@ nvinfer1::ITensor *TensorRTEngine::ConvertWeight2ITensor( for (int64_t i = 0; i < trt_in_shape.nbDims; i++) { trt_in_shape.d[i] = var_dims[i]; } + // Make 0-D tensor to 1-D tensor. + if (trt_in_shape.nbDims == 0) { + trt_in_shape.nbDims = 1; + trt_in_shape.d[0] = 1; + } // In fact , this is not always right, because we can't determine if the 0th // dimension is batch. 
Just for run chenqu's model if (!this->with_dynamic_shape()) { diff --git a/paddle/fluid/inference/tensorrt/engine.h b/paddle/fluid/inference/tensorrt/engine.h index 1f69bbfba097b..303778b588968 100644 --- a/paddle/fluid/inference/tensorrt/engine.h +++ b/paddle/fluid/inference/tensorrt/engine.h @@ -524,6 +524,10 @@ class TensorRTEngine { for (const auto& it : runtime_input_shape) { auto name = it.first; auto input_shape = it.second; + // Make 0-D tensor to 1-D tensor. + if (input_shape.size() == 0) { + input_shape.push_back(1); + } bool min_change = false; bool max_change = false; std::vector bak_min_shape; diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h index d6679ea26f8f7..f05ffe620c73c 100644 --- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h +++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h @@ -594,6 +594,18 @@ class TensorRTEngineOp : public framework::OperatorBase { t.ShareDataWith(out); } auto t_shape = phi::vectorize(t.dims()); + + // This must be a zero dimension tensor. + // At present, we convert it to a 1D tensor to feed them into Trt. + if (t_shape.size() == 0) { + PADDLE_ENFORCE_EQ( + t.numel(), + 1UL, + platform::errors::PreconditionNotMet( + "This tensor must have one element, but got %ld.", t.numel())); + t_shape.push_back(1); + } + // Get index of profile 0 first, then plus binding offset const int bind_index = engine->engine()->getBindingIndex(x.c_str()) + binding_offset; From ae2d8ba157540b39a4d7ab897c030217a33e82cb Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Mon, 24 Jul 2023 21:01:05 +0800 Subject: [PATCH 18/34] [AutoParallel] Simplify DistTensor namespace path (#55593) * simplify dist tensor namespace path * fix tensor dist attr decl error --- paddle/fluid/pybind/eager.cc | 2 +- paddle/fluid/pybind/eager_method.cc | 4 ++-- paddle/fluid/pybind/eager_properties.cc | 5 ++--- paddle/fluid/pybind/eager_utils.cc | 2 +- paddle/fluid/pybind/eager_utils.h | 2 +- paddle/fluid/pybind/tensor.cc | 2 +- paddle/phi/api/lib/tensor.cc | 2 +- paddle/phi/core/dense_tensor.h | 4 +--- paddle/phi/core/distributed/auto_parallel/dist_tensor.cc | 2 -- paddle/phi/core/distributed/auto_parallel/dist_tensor.h | 6 +++--- paddle/phi/core/utils/type_info.cc | 3 +-- 11 files changed, 14 insertions(+), 20 deletions(-) diff --git a/paddle/fluid/pybind/eager.cc b/paddle/fluid/pybind/eager.cc index 316a19728c19b..42d8911bdafa0 100644 --- a/paddle/fluid/pybind/eager.cc +++ b/paddle/fluid/pybind/eager.cc @@ -45,7 +45,7 @@ limitations under the License. 
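The new enforce in the converter above fires when a TensorRT input arrives without recorded min/max/opt shapes. In dynamic-shape mode those profiles come from the user-side inference config; a hedged sketch of supplying them (the model paths and the input name "x" are made up for illustration):

    from paddle.inference import Config

    config = Config("model.pdmodel", "model.pdiparams")  # hypothetical paths
    config.enable_use_gpu(100, 0)
    config.enable_tensorrt_engine(max_batch_size=4)
    # Each TensorRT input needs a min/max/opt profile; a missing entry now
    # raises "Cannot get <name> min/max/opt shape" instead of a bad access.
    config.set_trt_dynamic_shape_info(
        {"x": [1, 3, 224, 224]},  # min
        {"x": [4, 3, 224, 224]},  # max
        {"x": [1, 3, 224, 224]},  # opt
    )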
*/ #ifdef PADDLE_WITH_DISTRIBUTE #include "paddle/phi/core/distributed/auto_parallel/dist_attr.h" #include "paddle/phi/core/distributed/auto_parallel/dist_tensor.h" -using phi::distributed::auto_parallel::DistTensor; +using phi::distributed::DistTensor; using phi::distributed::auto_parallel::TensorDistAttr; #endif diff --git a/paddle/fluid/pybind/eager_method.cc b/paddle/fluid/pybind/eager_method.cc index 6a3f7e09c202a..6d07363805f8f 100644 --- a/paddle/fluid/pybind/eager_method.cc +++ b/paddle/fluid/pybind/eager_method.cc @@ -801,8 +801,8 @@ static PyObject* tensor_method_get_underline_tensor(TensorObject* self, return ToPyObject(tensor); } else if (self->tensor.is_dist_tensor()) { #ifdef PADDLE_WITH_DISTRIBUTE - auto* tensor = static_cast( - self->tensor.impl().get()); + auto* tensor = + static_cast(self->tensor.impl().get()); VLOG(6) << "dist tensor: " << tensor->defined(); return ToPyObject(tensor); #else diff --git a/paddle/fluid/pybind/eager_properties.cc b/paddle/fluid/pybind/eager_properties.cc index 42c5b97067b0e..42d53ad7bee01 100644 --- a/paddle/fluid/pybind/eager_properties.cc +++ b/paddle/fluid/pybind/eager_properties.cc @@ -164,9 +164,8 @@ PyObject* tensor_properties_get_dist_attr(TensorObject* self, void* closure) { EAGER_TRY if (self->tensor.is_dist_tensor()) { #ifdef PADDLE_WITH_DISTRIBUTE - phi::distributed::auto_parallel::DistTensor* dist_tensor = - static_cast( - self->tensor.impl().get()); + phi::distributed::DistTensor* dist_tensor = + static_cast(self->tensor.impl().get()); return ToPyObject(dist_tensor->dist_attr().get()); #else RETURN_PY_NONE diff --git a/paddle/fluid/pybind/eager_utils.cc b/paddle/fluid/pybind/eager_utils.cc index e365819928e66..8dfc7cfc8e426 100644 --- a/paddle/fluid/pybind/eager_utils.cc +++ b/paddle/fluid/pybind/eager_utils.cc @@ -859,7 +859,7 @@ PyObject* ToPyObject(const phi::DenseTensor* value) { } #ifdef PADDLE_WITH_DISTRIBUTE -PyObject* ToPyObject(const phi::distributed::auto_parallel::DistTensor* value) { +PyObject* ToPyObject(const phi::distributed::DistTensor* value) { auto obj = ::pybind11::cast(value, py::return_value_policy::reference); obj.inc_ref(); return obj.ptr(); diff --git a/paddle/fluid/pybind/eager_utils.h b/paddle/fluid/pybind/eager_utils.h index 208d2f25e7d21..1fb53a3b9f7a6 100644 --- a/paddle/fluid/pybind/eager_utils.h +++ b/paddle/fluid/pybind/eager_utils.h @@ -113,7 +113,7 @@ PyObject* ToPyObject(const std::vector>& value, PyObject* ToPyObject(const platform::Place& value); PyObject* ToPyObject(const phi::DenseTensor* value); #ifdef PADDLE_WITH_DISTRIBUTE -PyObject* ToPyObject(const phi::distributed::auto_parallel::DistTensor* value); +PyObject* ToPyObject(const phi::distributed::DistTensor* value); PyObject* ToPyObject( const phi::distributed::auto_parallel::TensorDistAttr* value); #endif diff --git a/paddle/fluid/pybind/tensor.cc b/paddle/fluid/pybind/tensor.cc index 98ae45dd0134b..e9ad190ea3f3c 100644 --- a/paddle/fluid/pybind/tensor.cc +++ b/paddle/fluid/pybind/tensor.cc @@ -1025,7 +1025,7 @@ void BindTensor(pybind11::module &m) { // NOLINT #endif #ifdef PADDLE_WITH_DISTRIBUTE - using phi::distributed::auto_parallel::DistTensor; + using phi::distributed::DistTensor; py::class_(m, "DistTensor") .def( "get_tensor", diff --git a/paddle/phi/api/lib/tensor.cc b/paddle/phi/api/lib/tensor.cc index 40319fa9ba660..b835230e22978 100644 --- a/paddle/phi/api/lib/tensor.cc +++ b/paddle/phi/api/lib/tensor.cc @@ -133,7 +133,7 @@ bool Tensor::is_dense_tensor() const { } bool Tensor::is_dist_tensor() const { #ifdef 
PADDLE_WITH_DISTRIBUTE - return phi::distributed::auto_parallel::DistTensor::classof(impl_.get()); + return phi::distributed::DistTensor::classof(impl_.get()); #else return false; #endif diff --git a/paddle/phi/core/dense_tensor.h b/paddle/phi/core/dense_tensor.h index 2cfdd7493c438..8af8f745baff7 100644 --- a/paddle/phi/core/dense_tensor.h +++ b/paddle/phi/core/dense_tensor.h @@ -30,9 +30,7 @@ namespace phi { class DenseTensorUtils; namespace distributed { -namespace auto_parallel { class DistTensor; -} // namespace auto_parallel } // namespace distributed /// \brief The Dense tensor stores values in a contiguous sequential block @@ -186,7 +184,7 @@ class DenseTensor : public TensorBase, private: friend class DenseTensorUtils; - friend class phi::distributed::auto_parallel::DistTensor; + friend class phi::distributed::DistTensor; protected: DenseTensorMeta meta_; diff --git a/paddle/phi/core/distributed/auto_parallel/dist_tensor.cc b/paddle/phi/core/distributed/auto_parallel/dist_tensor.cc index 6f60773132656..b234fc3c17485 100644 --- a/paddle/phi/core/distributed/auto_parallel/dist_tensor.cc +++ b/paddle/phi/core/distributed/auto_parallel/dist_tensor.cc @@ -16,7 +16,6 @@ namespace phi { namespace distributed { -namespace auto_parallel { void* DistTensor::AllocateFrom(Allocator* allocator, DataType dtype, @@ -59,6 +58,5 @@ void DistTensor::set_meta(const DenseTensorMeta& meta) { meta_ = meta; } -} // namespace auto_parallel } // namespace distributed } // namespace phi diff --git a/paddle/phi/core/distributed/auto_parallel/dist_tensor.h b/paddle/phi/core/distributed/auto_parallel/dist_tensor.h index ed47727fe9a3a..eb3a6dbbe3e66 100644 --- a/paddle/phi/core/distributed/auto_parallel/dist_tensor.h +++ b/paddle/phi/core/distributed/auto_parallel/dist_tensor.h @@ -18,11 +18,12 @@ #include "paddle/phi/core/dense_tensor.h" namespace phi { - namespace distributed { -namespace auto_parallel { +namespace auto_parallel { class TensorDistAttr; +} +using auto_parallel::TensorDistAttr; class DistTensor final : public phi::TensorBase, @@ -125,6 +126,5 @@ class DistTensor final std::unique_ptr value_{nullptr}; }; -} // namespace auto_parallel } // namespace distributed } // namespace phi diff --git a/paddle/phi/core/utils/type_info.cc b/paddle/phi/core/utils/type_info.cc index 2a554525024c8..0de0289887507 100644 --- a/paddle/phi/core/utils/type_info.cc +++ b/paddle/phi/core/utils/type_info.cc @@ -56,8 +56,7 @@ template class TypeInfoTraits; template class TypeInfoTraits; #ifdef PADDLE_WITH_DISTRIBUTE -template class TypeInfoTraits; +template class TypeInfoTraits; #endif #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ From a7567cd0e0b9541075bbdccb5dc0a13bca024dbc Mon Sep 17 00:00:00 2001 From: zhangbo9674 <82555433+zhangbo9674@users.noreply.github.com> Date: Tue, 25 Jul 2023 09:59:42 +0800 Subject: [PATCH 19/34] add vjp interface (#55660) --- paddle/fluid/ir/interface/interface.cc | 2 + paddle/fluid/ir/interface/vjp.h | 59 ++++++++++++++++++++++++++ 2 files changed, 61 insertions(+) create mode 100644 paddle/fluid/ir/interface/vjp.h diff --git a/paddle/fluid/ir/interface/interface.cc b/paddle/fluid/ir/interface/interface.cc index 442be02e2f235..ce43e44782867 100644 --- a/paddle/fluid/ir/interface/interface.cc +++ b/paddle/fluid/ir/interface/interface.cc @@ -14,6 +14,8 @@ #include "paddle/fluid/ir/interface/infermeta.h" #include "paddle/fluid/ir/interface/op_yaml_info.h" +#include "paddle/fluid/ir/interface/vjp.h" IR_DEFINE_EXPLICIT_TYPE_ID(paddle::dialect::InferMetaInterface) 
IR_DEFINE_EXPLICIT_TYPE_ID(paddle::dialect::OpYamlInfoInterface) +IR_DEFINE_EXPLICIT_TYPE_ID(paddle::dialect::VjpInterface) diff --git a/paddle/fluid/ir/interface/vjp.h b/paddle/fluid/ir/interface/vjp.h new file mode 100644 index 0000000000000..dec58f54af7e2 --- /dev/null +++ b/paddle/fluid/ir/interface/vjp.h @@ -0,0 +1,59 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once + +#include "paddle/ir/core/op_base.h" + +namespace paddle { +namespace dialect { +class VjpInterface : public ir::OpInterfaceBase { + public: + struct Concept { + explicit Concept(std::vector> (*vjp)( + std::vector> out_grads, + const std::vector>& stop_gradients)) + : vjp_(vjp) {} + std::vector> (*vjp_)( + std::vector> out_grads, + const std::vector>& stop_gradients); + }; + + template + struct Model : public Concept { + static std::vector> Vjp( + std::vector> out_grads, + const std::vector>& stop_gradients) { + return ConcreteOp::Vjp(out_grads, stop_gradients); + } + + Model() : Concept(Vjp) {} + }; + + VjpInterface(ir::Operation* op, Concept* impl) + : ir::OpInterfaceBase(op), impl_(impl) {} + + std::vector> Vjp( + std::vector> out_grads, + const std::vector>& stop_gradients) { + return impl_->vjp_(out_grads, stop_gradients); + } + + private: + Concept* impl_; +}; + +} // namespace dialect +} // namespace paddle + +IR_DECLARE_EXPLICIT_TYPE_ID(paddle::dialect::VjpInterface) From 14094aad5c663d0bd387ef36c33173bc6faa8da6 Mon Sep 17 00:00:00 2001 From: jiangfan06 <117341294+MuShangCC@users.noreply.github.com> Date: Tue, 25 Jul 2023 10:22:34 +0800 Subject: [PATCH 20/34] [XPU] Add FP16 support for arg_min_max (#55642) --- paddle/phi/backends/xpu/xpu2_op_list.cc | 5 ++++- paddle/phi/kernels/xpu/arg_min_max_kernel.cc | 13 ++++++++++--- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/paddle/phi/backends/xpu/xpu2_op_list.cc b/paddle/phi/backends/xpu/xpu2_op_list.cc index c146108917eeb..acbcc81b12fb2 100644 --- a/paddle/phi/backends/xpu/xpu2_op_list.cc +++ b/paddle/phi/backends/xpu/xpu2_op_list.cc @@ -36,7 +36,10 @@ XPUOpMap& get_kl2_ops() { {"adam_dense_param_sparse_grad", XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, {"adagrad", XPUKernelSet({phi::DataType::FLOAT32})}, - {"arg_max", XPUKernelSet({phi::DataType::FLOAT32})}, + {"arg_max", + XPUKernelSet({phi::DataType::INT32, + phi::DataType::FLOAT32, + phi::DataType::FLOAT16})}, {"argsort_grad", XPUKernelSet({phi::DataType::INT32, phi::DataType::INT64, diff --git a/paddle/phi/kernels/xpu/arg_min_max_kernel.cc b/paddle/phi/kernels/xpu/arg_min_max_kernel.cc index 39f04df6139fa..2b637e9da09e8 100644 --- a/paddle/phi/kernels/xpu/arg_min_max_kernel.cc +++ b/paddle/phi/kernels/xpu/arg_min_max_kernel.cc @@ -35,6 +35,7 @@ void ArgMaxKernel(const Context& dev_ctx, bool flatten, int dtype, DenseTensor* out) { + using XPUType = typename XPUTypeTrait::Type; PADDLE_ENFORCE_EQ( (dtype < 0 || dtype == ARG_MAX_OUTPUT_DATATYPE_INT32 || dtype == 
ARG_MAX_OUTPUT_DATATYPE_INT64), @@ -69,7 +70,7 @@ void ArgMaxKernel(const Context& dev_ctx, return; } r = xpu::argmax(dev_ctx.x_context(), - x.data(), + reinterpret_cast(x.data()), out->data(), xdims_vec, axis_val); @@ -90,7 +91,7 @@ void ArgMaxKernel(const Context& dev_ctx, static_cast(0)); } else { r = xpu::argmax(dev_ctx.x_context(), - x.data(), + reinterpret_cast(x.data()), out_int64.data(), xdims_vec, axis_val); @@ -116,6 +117,12 @@ void ArgMaxKernel(const Context& dev_ctx, } } } // namespace phi -PD_REGISTER_KERNEL(argmax, XPU, ALL_LAYOUT, phi::ArgMaxKernel, float) { +PD_REGISTER_KERNEL(argmax, + XPU, + ALL_LAYOUT, + phi::ArgMaxKernel, + float, + int, + phi::dtype::float16) { kernel->OutputAt(0).SetDataType(phi::DataType::UNDEFINED); } From ca72aa2ac25c0427677ae1eb85a00993b2b6fd0c Mon Sep 17 00:00:00 2001 From: jiangfan06 <117341294+MuShangCC@users.noreply.github.com> Date: Tue, 25 Jul 2023 10:28:33 +0800 Subject: [PATCH 21/34] Fix reduce_ops for mixed-precision FP16 support (#55573) --- paddle/fluid/framework/ir/auto_mixed_precision_pass.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/framework/ir/auto_mixed_precision_pass.cc b/paddle/fluid/framework/ir/auto_mixed_precision_pass.cc index 6e12cf00e903b..6139ccb313788 100644 --- a/paddle/fluid/framework/ir/auto_mixed_precision_pass.cc +++ b/paddle/fluid/framework/ir/auto_mixed_precision_pass.cc @@ -415,7 +415,8 @@ void AutoMixedPrecisionPass::GetOpPrecision() const { auto out_dtype = op_node->Op()->GetAttrIfExists("out_dtype"); support_low_precision = support_low_precision && - IsFP32AndFP64(static_cast(out_dtype)); + (IsFP32AndFP64(static_cast(out_dtype)) || + out_dtype == -1); } // If scale op's "scale" and "bias" attr value exceed the range of fp16 From 05a40691e2d1de7244140ab486cf2fe32bfc5729 Mon Sep 17 00:00:00 2001 From: HongyuJia Date: Tue, 25 Jul 2023 10:31:13 +0800 Subject: [PATCH 22/34] [0D-Tensor] Fix test_elementwise_max_op unittest (#55674) --- test/legacy_test/test_elementwise_max_op.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/test/legacy_test/test_elementwise_max_op.py b/test/legacy_test/test_elementwise_max_op.py index 40aeaa50a0e1c..2d202f1c0843a 100644 --- a/test/legacy_test/test_elementwise_max_op.py +++ b/test/legacy_test/test_elementwise_max_op.py @@ -124,9 +124,6 @@ def init_data(self): self.x = np.random.uniform(0.1, 1, []).astype("float64") self.y = np.random.uniform(0.1, 1, []).astype("float64") - def if_enbale_cinn(self): - self.enable_cinn = False - class TestElementwiseMaxFP16Op_ZeroDim1(TestElementwiseFP16Op): def init_data(self): @@ -142,9 +139,6 @@ def init_data(self): self.x = np.random.uniform(0.1, 1, [13, 17]).astype("float64") self.y = np.random.uniform(0.1, 1, []).astype("float64") - def if_enbale_cinn(self): - self.enable_cinn = False - class TestElementwiseMaxFP16Op_ZeroDim2(TestElementwiseFP16Op): def init_data(self): @@ -160,9 +154,6 @@ def init_data(self): self.x = np.random.uniform(0.1, 1, []).astype("float64") self.y = np.random.uniform(0.1, 1, [13, 17]).astype("float64") - def if_enbale_cinn(self): - self.enable_cinn = False - class TestElementwiseMaxFP16Op_ZeroDim3(TestElementwiseFP16Op): def init_data(self): From 03a2f1878cc37efabe55e3dbdf9c08f80019c0e1 Mon Sep 17 00:00:00 2001 From: qiuwenbo Date: Tue, 25 Jul 2023 10:34:28 +0800 Subject: [PATCH 23/34] =?UTF-8?q?=E8=A7=A3=E5=86=B3=20grad=5Ffn=20next=5Ff?= =?UTF-8?q?unctions=20api=20=E6=8E=A5=E5=8F=A3=E5=AF=BC=E8=87=B4=E5=86=85?= 
=?UTF-8?q?=E5=AD=98=E5=BC=82=E5=B8=B8=E7=9A=84=E9=97=AE=E9=A2=98=20-=20?=
 =?UTF-8?q?=20(#55627)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* [Attempt] Add an attribute to tensor whose value is the constant 1

* Expose gradnode and build new gradnode methods (for testing), exposed to Python so the Python side can access them

* Develop the two APIs grad_fn and next_functions, expose them to the Python side, and do some normalization

* Add a unit test

* Improve code style

* Move the unit test file to the correct location

* Improve code style

* Remove useless comments

* Fix "__main__ has no attribute"

* Modify the unit test file

* Modify the unit test script (temp)

* Fix the memory anomaly caused by the grad_fn / next_functions API

* Modify the unit test contents

* Fix code-style issues

---
 paddle/fluid/pybind/eager_properties.cc        |  9 ++++---
 paddle/fluid/pybind/eager_utils.cc             |  5 ++--
 paddle/fluid/pybind/eager_utils.h              |  2 +-
 paddle/fluid/pybind/pybind.cc                  | 24 ++++++++++++++-----
 .../test_grad_fn_and_next_functions.py         |  5 ++++
 5 files changed, 30 insertions(+), 15 deletions(-)

diff --git a/paddle/fluid/pybind/eager_properties.cc b/paddle/fluid/pybind/eager_properties.cc
index 42d53ad7bee01..2a7692ee99bb3 100644
--- a/paddle/fluid/pybind/eager_properties.cc
+++ b/paddle/fluid/pybind/eager_properties.cc
@@ -317,17 +317,16 @@ PyObject* tensor_properties_get_grad_fn(TensorObject* self, void* closure) {
 
   if (meta) {
     // Get the GradNode from meta
-    auto grad_node = meta->GradNode();  // Convert GradNode to a Python object
-    // The conversion will depend on the structure of GradNode.
-
-    if (!grad_node) {
+    auto grad_node_ptr = meta->GetMutableGradNode();
+    if (!grad_node_ptr) {
       Py_INCREF(Py_None);
       return Py_None;
     }
 
-    PyObject* py_grad_node = ToPyObject(grad_node);
+    PyObject* py_grad_node = ToPyObject(grad_node_ptr);
 
     return py_grad_node;
+
   } else {
     // If meta does not exist, return an appropriate Python object (e.g., None
     // or a special value).
diff --git a/paddle/fluid/pybind/eager_utils.cc b/paddle/fluid/pybind/eager_utils.cc
index 8dfc7cfc8e426..ee270042f4176 100644
--- a/paddle/fluid/pybind/eager_utils.cc
+++ b/paddle/fluid/pybind/eager_utils.cc
@@ -1006,10 +1006,9 @@ paddle::optional<paddle::Tensor> GetOptionalTensorFromArgs(
   }
 }
 
-PyObject* ToPyObject(egr::GradNodeBase* grad_node) {
+PyObject* ToPyObject(std::shared_ptr<egr::GradNodeBase> grad_node) {
   py::object py_obj = py::cast(grad_node, py::return_value_policy::reference);
-  py::handle py_handle = py::handle(py_obj);
-  PyObject* py_grad_node = py_handle.ptr();
+  PyObject* py_grad_node = py_obj.release().ptr();
   Py_INCREF(py_grad_node);
   return py_grad_node;
 }
diff --git a/paddle/fluid/pybind/eager_utils.h b/paddle/fluid/pybind/eager_utils.h
index 1fb53a3b9f7a6..f50ec9395b2f1 100644
--- a/paddle/fluid/pybind/eager_utils.h
+++ b/paddle/fluid/pybind/eager_utils.h
@@ -126,7 +126,7 @@ PyObject* ToPyObject(
     const std::unordered_map>& value);
 PyObject* ToPyObject(const paddle::framework::Vocab& value);
 
-PyObject* ToPyObject(egr::GradNodeBase* grad_node);
+PyObject* ToPyObject(std::shared_ptr<egr::GradNodeBase> grad_node);
 
 class PyTensorHook : public egr::TensorHook {
  public:
diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index d55cab98b1eba..504e1adf22569 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -778,12 +778,24 @@ PYBIND11_MODULE(libpaddle, m) {
         }
       });
 
-  py::class_<egr::GradNodeBase>(m, "GradNodeBase")
-      .def("name", &egr::GradNodeBase::name)
-      .def_property_readonly("next_functions",
-                             &egr::GradNodeBase::NextFunctions)
-      .def("input_meta", &egr::GradNodeBase::InputMeta)
-      .def("output_meta", &egr::GradNodeBase::OutputMeta);
+  py::class_<egr::GradNodeBase, std::shared_ptr<egr::GradNodeBase>>(
+      m, "GradNodeBase")
+      .def("name",
+           [](const std::shared_ptr<egr::GradNodeBase> &self) {
+             return self->name();
+           })
+      .def_property_readonly(
+          "next_functions",
+          [](const std::shared_ptr<egr::GradNodeBase> &self) {
+            return self->NextFunctions();
+          })
+      .def("input_meta",
+           [](const std::shared_ptr<egr::GradNodeBase> &self) {
+             return self->InputMeta();
+           })
+      .def("output_meta", [](const std::shared_ptr<egr::GradNodeBase> &self) {
+        return self->OutputMeta();
+      });
 
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   m.def("cudnn_version", &platform::DnnVersion);
diff --git a/test/legacy_test/test_grad_fn_and_next_functions.py b/test/legacy_test/test_grad_fn_and_next_functions.py
index 5464775001253..531cdfa98a070 100644
--- a/test/legacy_test/test_grad_fn_and_next_functions.py
+++ b/test/legacy_test/test_grad_fn_and_next_functions.py
@@ -83,6 +83,11 @@ def check_func(self, grad_fn, grad_fn_json) -> None:
             grad_fn_json (dict): grad_node_json of node
         """
         self.assertEqual(grad_fn.name(), grad_fn_json["func_name"])
+        # Recursively test other nodes
+        if hasattr(grad_fn, 'next_functions') and grad_fn.next_functions[0]:
+            next_funcs_json = grad_fn_json["next_funcs"]
+            for u in grad_fn.next_functions:
+                self.check_func(u, next_funcs_json[u.name()])
 
 
 if __name__ == "__main__":

From 98c7a3e0bb50025b98b82a95f6580882b9334fa8 Mon Sep 17 00:00:00 2001
From: kangguangli
Date: Tue, 25 Jul 2023 10:47:28 +0800
Subject: [PATCH 24/34] [BugFix] fix random fail of test_bilinear_interp_v2_op
 (#55643)

* fix random fail of test_bilinear_interp_v2_op

* reset if compiledProgram

---
 test/legacy_test/eager_op_test.py | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/test/legacy_test/eager_op_test.py b/test/legacy_test/eager_op_test.py
index 838e8de35235c..489d8ce02d7dd 100644
--- a/test/legacy_test/eager_op_test.py
+++ b/test/legacy_test/eager_op_test.py
@@ -1211,13 +1211,21 @@ def _check_ir_output(self, place, program, feed_map, fetch_list, outs):
             return
         set_flags({"FLAGS_enable_new_ir_in_executor": True})
-
+        new_scope = paddle.static.Scope()
         executor = Executor(place)
+        new_program = None
+        if isinstance(program, paddle.static.CompiledProgram):
+            new_program = fluid.CompiledProgram(
+                program._program, build_strategy=program._build_strategy
+            )
+        else:
+            new_program = program.clone()
         ir_outs = executor.run(
-            program,
+            new_program,
             feed=feed_map,
             fetch_list=fetch_list,
             return_numpy=False,
+            scope=new_scope,
         )
         assert len(outs) == len(
             ir_outs

From 057202574bf2c4e15d4bb5f4d313488af3870202 Mon Sep 17 00:00:00 2001
From: Yiqun Liu
Date: Tue, 25 Jul 2023 10:57:27 +0800
Subject: [PATCH 25/34] Call multiply_ instead of scale_ to avoid multiple
 DtoH copies. (#55589)

* Call multiply_ instead of scale_ to avoid multiple DtoH copies.

* Call _squared_l2_norm to calculate grad_clip.

* Fix import error.
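The switch below from scale_ to multiply_ is easy to miss: in this clipping path the coefficient passed to scale_ is consumed as a host-side scalar, so a GPU-resident clip_var costs one device-to-host copy per gradient, while multiply_ is a tensor-tensor elementwise op that keeps everything on the device. A minimal sketch (assumes a GPU build; the shapes are illustrative):

    import paddle

    g = paddle.rand([1024, 1024])     # stands in for one gradient tensor
    clip_var = paddle.full([1], 0.5)  # global-norm coefficient, on device

    # g.scale_(float(clip_var)) would synchronize: reading the scalar back
    # to the host blocks once per gradient in the clipping loop.
    g.multiply_(clip_var)             # broadcast in-place multiply, no DtoH copy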
---
 .../dygraph_optimizer/hybrid_parallel_optimizer.py | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_optimizer.py
index b24247b580766..cef28af4d4795 100755
--- a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_optimizer.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_optimizer.py
@@ -103,8 +103,7 @@ def _dygraph_clip(self, params_grads):
             if g.type == core.VarDesc.VarType.SELECTED_ROWS:
                 merge_grad = clip.merge_selected_rows(g)
                 merge_grad = clip.get_tensor_from_selected_rows(merge_grad)
-            square = paddle.square(merge_grad)
-            sum_square = paddle.sum(square)
+            sum_square = clip._squared_l2_norm(merge_grad)
 
             not_shared_enable = (not hasattr(p, 'is_firstly_shared')) or (
                 hasattr(p, 'is_firstly_shared')
@@ -230,15 +229,15 @@ def _dygraph_clip(self, params_grads):
                 if getattr(p, 'need_clip', True) is False:
                     continue
                 if g.dtype == paddle.float16:
-                    g.scale_(clip_var_fp16)
+                    g.multiply_(clip_var_fp16)
                 elif g.dtype == paddle.bfloat16:
                     if paddle.is_compiled_with_xpu():
                         raise NotImplementedError(
                             "BF16 is not supported on XPU now"
                         )
-                    g.scale_(clip_var_bf16)
+                    g.multiply_(clip_var_bf16)
                 else:
-                    g.scale_(clip_var)
+                    g.multiply_(clip_var)
                 p._reset_grad_inplace_version(True)
 
         return params_grads

From fb9bec5d78fbd8d6500055f77b302dd63afc4ccf Mon Sep 17 00:00:00 2001
From: hong <43953930+phlrain@users.noreply.github.com>
Date: Tue, 25 Jul 2023 13:35:14 +0800
Subject: [PATCH 26/34] [NewIR] new ir dygraph to static support gpu (#55620)

* add kernel dialect

* change DenseTensorTypeStorage to DenseTensorType

* add test case

* add first pd_op to kernel dialect

* lower pd op to kernel dialect

* update

* update

* remove useless code

* add attribute print test

* fix bug

* update

* update

* update

* update

* polish code

* fix bug

* polish code and add python test

* add test

* fix test error

* relax constraint when inserting get_parameter

* add env flag

* fix bug

* dygraph2static support new ir

* fix bug

* revert test env

* change cc_test_old to cc_test

* update

* fix build_static bug

* update test

* fix type test error

* update cmake

* disable test in windows

* fix inference compile

* fix program translator error

* only run on cpu, not support gpu yet

* fix conflict

* polish code

* fix bug

* add feed with place op

* update

* remove useless unit test

* update mkldnn

* update

* update

* align mkldnn version

* new ir support builtin slice op

* fix bug

* fix phi kernel adaptor bug

* add enable static

* add enable_static

* remove useless test case

* change feed list to single variable

* update

* add feed with place and shadow output op

* fix bug

* remove useless code

* support gpu

* fix bug

* fix bug

* remove template

* add more data type

* fix compile bug

* update

* remove useless code

* revert dygraph2st test

* remove useless code

* revert op

* fix bug

* new ir dygraph2static support gpu

* remove useless code

* code polish

* add const

* revert code and remove useless code

* revert code

* revert legacy op yaml

* remove useless code

* delete std::move

---------

Co-authored-by: kangguangli

---
 .../eager/to_static/run_program_op_node.h |  59 +++++--
 paddle/fluid/framework/CMakeLists.txt     |   3 +-
 paddle/fluid/framework/executor_cache.cc  | 167 +++++++++++++++++-
 paddle/fluid/framework/executor_cache.h   |  26 ++-
 .../interpreter/interpreter_util.cc       |   3 +-
paddle/fluid/framework/tensor_util.cc | 2 +- .../ir/phi_kernel_adaptor/phi_kernel_util.cc | 30 +++- .../ir/transforms/pd_op_to_kernel_pass.cc | 14 ++ .../ir_adaptor/translator/op_translator.cc | 62 +++++++ .../translator/program_translator.cc | 8 + paddle/phi/api/yaml/op_compat.yaml | 7 + paddle/phi/api/yaml/ops.yaml | 22 +++ paddle/phi/api/yaml/static_ops.yaml | 12 -- .../phi/kernels/cpu/feed_with_place_kernel.cc | 8 + paddle/phi/kernels/feed_with_place_kernel.h | 6 + test/ir/new_ir/test_feed_with_place.py | 1 + test/ir/new_ir/test_standalone_new_ir.py | 65 ++++++- 17 files changed, 459 insertions(+), 36 deletions(-) diff --git a/paddle/fluid/eager/to_static/run_program_op_node.h b/paddle/fluid/eager/to_static/run_program_op_node.h index 574adc1f9d4d4..a8e47953f65dd 100644 --- a/paddle/fluid/eager/to_static/run_program_op_node.h +++ b/paddle/fluid/eager/to_static/run_program_op_node.h @@ -19,12 +19,16 @@ #include "paddle/fluid/eager/tensor_wrapper.h" #include "paddle/fluid/framework/new_executor/interpretercore.h" #include "paddle/fluid/framework/variable_helper.h" +#include "paddle/fluid/ir/transforms/pd_op_to_kernel_pass.h" +#include "paddle/fluid/ir_adaptor/translator/program_translator.h" #include "paddle/fluid/operators/run_program_op.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/profiler/event_tracing.h" #include "paddle/ir/core/program.h" #include "paddle/ir/core/value.h" +PHI_DECLARE_bool(enable_new_ir_in_executor); + namespace details { using Tensor = paddle::Tensor; @@ -367,16 +371,32 @@ inline void RunProgramAPI( details::ShareTensorsIntoScope(x, global_inner_scope); details::ShareTensorsIntoScope(params, global_inner_scope); // Step 2. create new interpretercore - interpreter_core = - paddle::framework::CreateInterpreterCoreInfoToCache(*forward_program, - place, - /*is_grad=*/false, - program_id, - global_inner_scope); + + if (FLAGS_enable_new_ir_in_executor) { + // build new ir program + auto ir_program = paddle::framework::ConstructFowardIrProgram( + forward_global_block, backward_global_block, output_names, x); + interpreter_core = + paddle::framework::CreateNewIRInterpreterCoreInfoToCache( + std::move(ir_program), + place, + /*is_grad=*/false, + program_id, + global_inner_scope); + } else { + interpreter_core = + paddle::framework::CreateProgramInterpreterCoreInfoToCache( + *forward_program, + place, + /*is_grad=*/false, + program_id, + global_inner_scope); + } // Step 3. 
get all eager gc vars std::set skip_eager_delete_vars = paddle::framework::details::ParseSafeEagerDeletionSkipVarsSet( *backward_program); + // all out_vars are skip_eager_var skip_eager_delete_vars.insert(output_names.begin(), output_names.end()); skip_eager_delete_vars.insert(dout_names.begin(), dout_names.end()); @@ -504,12 +524,27 @@ inline void RunProgramGradAPI( 1); VLOG(2) << "No interpretercore cahce, so create a new interpretercore"; details::ShareTensorsIntoScope(out_grad, global_inner_scope); - interpreter_core = - paddle::framework::CreateInterpreterCoreInfoToCache(*backward_program, - place, - /*is_grad=*/true, - program_id, - global_inner_scope); + + if (FLAGS_enable_new_ir_in_executor) { + auto res = paddle::framework::ConstructBackwardIrProgram( + backward_global_block, out_grad, x_grad, params_grad); + + interpreter_core = + paddle::framework::CreateNewIRInterpreterCoreInfoToCache( + std::move(res), + place, + /*is_grad=*/true, + program_id, + global_inner_scope); + } else { + interpreter_core = + paddle::framework::CreateProgramInterpreterCoreInfoToCache( + *backward_program, + place, + /*is_grad=*/true, + program_id, + global_inner_scope); + } // share threadpool // NOTE(zhiqiu): this only works interpreter_core is executed strictly diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index 4137518cf69d4..41b681afb5400 100755 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -1033,7 +1033,8 @@ cc_library( cc_library( executor_cache SRCS executor_cache.cc - DEPS parallel_executor standalone_executor) + DEPS parallel_executor standalone_executor phi_kernel_adaptor + pd_op_to_kernel_pass ir) if(WITH_PSCORE) get_property(RPC_DEPS GLOBAL PROPERTY RPC_DEPS) if(WITH_HETERPS) diff --git a/paddle/fluid/framework/executor_cache.cc b/paddle/fluid/framework/executor_cache.cc index 9e8f4a25873d1..506ce36e47242 100644 --- a/paddle/fluid/framework/executor_cache.cc +++ b/paddle/fluid/framework/executor_cache.cc @@ -15,6 +15,8 @@ #include "paddle/fluid/framework/executor_cache.h" #include "paddle/fluid/framework/new_executor/interpretercore.h" #include "paddle/fluid/framework/op_info.h" +#include "paddle/fluid/ir/transforms/pd_op_to_kernel_pass.h" +#include "paddle/fluid/ir_adaptor/translator/translate.h" #include "paddle/ir/core/program.h" #include "paddle/ir/core/value.h" @@ -288,7 +290,7 @@ InterpreterCoreInfoCache &InterpreterCoreInfoCache::Instance() { return g_info_cache; } -std::shared_ptr CreateInterpreterCoreInfoToCache( +std::shared_ptr CreateProgramInterpreterCoreInfoToCache( const ProgramDesc &program_desc, const platform::Place &place, bool is_grad, @@ -304,13 +306,172 @@ std::shared_ptr CreateInterpreterCoreInfoToCache( interpreter::ExecutionConfig execution_config; execution_config.create_local_scope = false; execution_config.used_for_jit = true; - auto core = std::make_shared( - place, program_desc.Block(0), scope, execution_config); + + std::shared_ptr core = nullptr; + + core.reset(new InterpreterCore( + place, program_desc.Block(0), scope, execution_config)); + + auto &cached_value = + interpretercore_info_cache.GetMutable(program_id, is_grad); + cached_value.core_ = core; + return core; +} + +std::shared_ptr CreateNewIRInterpreterCoreInfoToCache( + std::unique_ptr<::ir::Program> ir_program, + const platform::Place &place, + bool is_grad, + int64_t program_id, + framework::Scope *scope) { + auto &interpretercore_info_cache = + framework::InterpreterCoreInfoCache::Instance(); + if 
(interpretercore_info_cache.Size() > 10u /* max_cached_size*/) { + VLOG(2) << "The cached info size has exceeded max_cached_size: 4, clear " + "all cache!"; + interpretercore_info_cache.Finalize(); + } + interpreter::ExecutionConfig execution_config; + execution_config.create_local_scope = false; + execution_config.used_for_jit = true; + + std::shared_ptr core = nullptr; + + core.reset(new InterpreterCore( + place, std::move(ir_program), scope, execution_config)); + auto &cached_value = interpretercore_info_cache.GetMutable(program_id, is_grad); cached_value.core_ = core; return core; } +std::unique_ptr<::ir::Program> ConstructFowardIrProgram( + const paddle::framework::BlockDesc *forward_global_block, + const paddle::framework::BlockDesc *backward_global_block, + const std::vector output_names, + const std::vector &x) { + auto ir_ctx = ::ir::IrContext::Instance(); + auto program = std::make_unique<::ir::Program>(ir_ctx); + + std::set set_output_names; + auto local_program = + paddle::framework::ProgramDesc(*(forward_global_block->Program())); + + for (auto op_desc : local_program.Block(0).AllOps()) { + for (const auto &n : op_desc->Outputs()) { + const auto &input_var_names = n.second; + for (const auto &var_name : input_var_names) { + set_output_names.insert(var_name); + } + } + } + + // add fetch with place op to program + for (auto &in_t : x) { + auto name = in_t.name(); + auto place = in_t.place().GetType(); + + auto op_desc = local_program.MutableBlock(0)->PrependOp(); + op_desc->SetType("feed_with_place"); + op_desc->SetAttr("index", 0); + // TODO(phlrain) : using tensor dtype + op_desc->SetAttr("dtype", 0); + op_desc->SetAttr("place", static_cast(place)); + op_desc->SetAttr("name", name); + op_desc->SetOutput("out", {name}); + } + + std::set set_parameter_names; + for (auto op_desc : backward_global_block->Program()->Block(0).AllOps()) { + for (const auto &n : op_desc->Inputs()) { + const auto &input_var_names = n.second; + for (const auto &var_name : input_var_names) { + set_parameter_names.insert(var_name); + } + } + } + + for (auto &t : output_names) { + set_parameter_names.insert(t); + } + + for (auto &name : set_parameter_names) { + if (!set_output_names.count(name)) { + continue; + } + + auto op_desc = local_program.MutableBlock(0)->AppendOp(); + op_desc->SetType("shaddow_output"); + op_desc->SetAttr("name", name); + op_desc->SetInput("x", {name}); + op_desc->SetOutput("out", {"@EMPTY@"}); + } + + paddle::translator::ProgramTranslator program_translator(&local_program, + program.get()); + + program_translator.Translate(); + + auto ir_res = paddle::dialect::PdOpLowerToKernelPass(program.get()); + + return ir_res; +} + +std::unique_ptr<::ir::Program> ConstructBackwardIrProgram( + const paddle::framework::BlockDesc *backward_global_block, + const std::vector &out_grad, + const std::vector &x_grad, + const std::vector ¶ms_grad) { + auto ir_ctx = ::ir::IrContext::Instance(); + auto program = std::make_unique<::ir::Program>(ir_ctx); + + auto local_program = + paddle::framework::ProgramDesc(*(backward_global_block->Program())); + // add feed kernel + for (auto &out_grad_t : out_grad) { + auto name = out_grad_t.name(); + auto place = out_grad_t.place().GetType(); + if (name == "@EMPTY@") { + continue; + } + auto op_desc = local_program.MutableBlock(0)->PrependOp(); + op_desc->SetType("feed_with_place"); + op_desc->SetAttr("index", 0); + // TODO(phlrain) : using tensor dtype + op_desc->SetAttr("dtype", 0); + op_desc->SetAttr("place", static_cast(place)); + op_desc->SetAttr("name", 
name); + op_desc->SetOutput("out", {name}); + } + + std::vector param_grad_names; + for (auto &p_g : params_grad) { + param_grad_names.push_back(p_g->name()); + } + + for (auto &t : x_grad) { + param_grad_names.push_back(t->name()); + } + for (auto &name : param_grad_names) { + if (name == "@EMPTY@") { + continue; + } + auto op_desc = local_program.MutableBlock(0)->AppendOp(); + op_desc->SetType("shaddow_output"); + op_desc->SetAttr("name", name); + op_desc->SetInput("x", {name}); + op_desc->SetOutput("out", {"@EMPTY@"}); + } + + paddle::translator::ProgramTranslator program_translator(&local_program, + program.get()); + program_translator.Translate(); + + auto res = paddle::dialect::PdOpLowerToKernelPass(program.get()); + + return res; +} + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/executor_cache.h b/paddle/fluid/framework/executor_cache.h index f4d926d74c146..c639b966286cb 100644 --- a/paddle/fluid/framework/executor_cache.h +++ b/paddle/fluid/framework/executor_cache.h @@ -29,6 +29,11 @@ #include "paddle/fluid/platform/macros.h" #include "paddle/fluid/string/string_helper.h" +#include "paddle/fluid/ir_adaptor/translator/program_translator.h" +#include "paddle/ir/core/dialect.h" +#include "paddle/ir/core/ir_context.h" +#include "paddle/ir/core/program.h" + namespace paddle { namespace framework { namespace ir { @@ -218,12 +223,31 @@ class InterpreterCoreInfoCache { std::unordered_map info_map_; }; -std::shared_ptr CreateInterpreterCoreInfoToCache( +std::shared_ptr CreateProgramInterpreterCoreInfoToCache( const ProgramDesc& program_desc, const platform::Place& place, bool is_grad, int64_t program_id, framework::Scope* scope); +std::shared_ptr CreateNewIRInterpreterCoreInfoToCache( + std::unique_ptr<::ir::Program> ir_prog, + const platform::Place& place, + bool is_grad, + int64_t program_id, + framework::Scope* scope); + +std::unique_ptr<::ir::Program> ConstructFowardIrProgram( + const paddle::framework::BlockDesc* forward_global_block, + const paddle::framework::BlockDesc* backward_global_block, + const std::vector output_names, + const std::vector& x); + +std::unique_ptr<::ir::Program> ConstructBackwardIrProgram( + const paddle::framework::BlockDesc* backward_global_block, + const std::vector& out_grad, + const std::vector& x_grad, + const std::vector& params_grad); + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc b/paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc index 70be3b9dd035a..035f4cd4f16d9 100644 --- a/paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc +++ b/paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc @@ -958,7 +958,8 @@ void BuildOpFuncList( if (op_name == "builtin.combine" || op_name == "pd.feed" || op_name == "builtin.set_parameter" || - op_name == "builtin.get_parameter" || op_name == "builtin.slice") { + op_name == "builtin.get_parameter" || op_name == "builtin.slice" || + op_name == "pd.feed_with_place" || op_name == "pd.shaddow_output") { VLOG(6) << "skip process " << op_name; continue; } diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc index 6552a14a03fcc..d4421ed7ab009 100644 --- a/paddle/fluid/framework/tensor_util.cc +++ b/paddle/fluid/framework/tensor_util.cc @@ -984,7 +984,7 @@ std::ostream& operator<<(std::ostream& os, const phi::DenseTensor& t) { do { \ if (paddle::framework::TransToProtoVarType(tensor.dtype()) == \ proto_type) { \ - os << " - 
dtype: " << proto_type << "\n"; \ + os << " - dtype: " << tensor.dtype() << "\n"; \ paddle::framework::print_tensor(os, tensor); \ return os; \ } \ diff --git a/paddle/fluid/ir/phi_kernel_adaptor/phi_kernel_util.cc b/paddle/fluid/ir/phi_kernel_adaptor/phi_kernel_util.cc index 1a880210afbe1..95702ac672113 100644 --- a/paddle/fluid/ir/phi_kernel_adaptor/phi_kernel_util.cc +++ b/paddle/fluid/ir/phi_kernel_adaptor/phi_kernel_util.cc @@ -66,8 +66,10 @@ paddle::framework::Variable* CreateVar( } paddle::framework::Variable* var = nullptr; + std::string name = var_name_prefix + "_inner_var_" + std::to_string(variable_2_var_name->size()); + if (force_persisable || is_persisable) { VLOG(6) << "Create var: " << name << " in scope " << inner_scope->root(); var = const_cast(inner_scope->root())->Var(name); @@ -202,6 +204,15 @@ void HandleForSpecialOp( value_2_var_name->emplace(value, feed_var_name); } + if (op_name == "pd.feed_with_place") { + VLOG(6) << "Handle for pd.feed_with_place"; + auto var_name = + op->attributes().at("name").dyn_cast().AsString(); + + auto value = op->result(0); + value_2_var_name->emplace(value, var_name); + } + if (op_name == "builtin.combine") { auto out_value = op->result(0); @@ -252,6 +263,22 @@ void HandleForSpecialOp( (*value_2_var_name)[value] = param_name; } + if (op_name == "pd.shaddow_output") { + VLOG(6) << "Handle for pd.shaddow_ouptut"; + auto var_name = + op->attributes().at("name").dyn_cast().AsString(); + + auto value = op->operand(0); + // change opreand name to param_name + auto orig_name = value_2_var_name->at(value); + + if (inner_scope->root()->FindVar(var_name) == nullptr) { + const_cast(inner_scope->root()) + ->Rename(orig_name, var_name); + } + (*value_2_var_name)[value] = var_name; + } + if (op_name == "builtin.get_parameter") { VLOG(6) << "Handle for builtin.get_parameter:"; auto param_name = op->attributes() @@ -362,7 +389,8 @@ void BuildScope(const ir::Block& block, if (op_name == "pd.feed" || op_name == "pd.fetch" || op_name == "builtin.combine" || op_name == "builtin.set_parameter" || - op_name == "builtin.get_parameter" || op_name == "builtin.slice") { + op_name == "builtin.get_parameter" || op_name == "builtin.slice" || + op_name == "pd.feed_with_place" || op_name == "pd.shaddow_output") { HandleForSpecialOp(op, inner_scope, var_name_prefix, diff --git a/paddle/fluid/ir/transforms/pd_op_to_kernel_pass.cc b/paddle/fluid/ir/transforms/pd_op_to_kernel_pass.cc index d55ce6b24f9cf..beb4635bebba4 100644 --- a/paddle/fluid/ir/transforms/pd_op_to_kernel_pass.cc +++ b/paddle/fluid/ir/transforms/pd_op_to_kernel_pass.cc @@ -62,6 +62,20 @@ phi::KernelKey GetKernelKey( TransToPhiDataType( op->result(0).type().dyn_cast().dtype())}; } + + if (op->name() == "pd.feed_with_place") { + // NOTE, for now feed op don't need a kernel, so the data type from Op + // Result the next op use base program datatype + auto t = + op->attributes().at("place").dyn_cast().data(); + + auto backend = paddle::experimental::ParseBackend(t); + return {backend, + phi::DataLayout::ANY, + TransToPhiDataType( + op->result(0).type().dyn_cast().dtype())}; + } + phi::Backend kernel_backend = phi::Backend::UNDEFINED; phi::DataLayout kernel_layout = phi::DataLayout::UNDEFINED; phi::DataType kernel_data_type = phi::DataType::UNDEFINED; diff --git a/paddle/fluid/ir_adaptor/translator/op_translator.cc b/paddle/fluid/ir_adaptor/translator/op_translator.cc index 0aab57af7998a..ee2f66692eda8 100644 --- a/paddle/fluid/ir_adaptor/translator/op_translator.cc +++ 
b/paddle/fluid/ir_adaptor/translator/op_translator.cc @@ -954,6 +954,39 @@ struct FeedOpTranscriber : public OpTranscriber { } }; +struct FeedWithPlaceOpTranscriber : public OpTranscriber { + ir::AttributeMap TranslateOpAttribute( + ir::IrContext* ctx, + const std::string& normalized_op_name, + const OpAttributeInfoList& op_attr_infos, + const OpDesc& op_desc) override { + int allocate_type = paddle::get(op_desc.GetAttr("place")); + ir::AttributeMap attribute_map = { + {"name", + ir::StrAttribute::get(ctx, + op_desc.GetAttrIfExists("name"))}, + {"index", ir::Int64Attribute::get(ctx, 0)}, + {"dtype", + paddle::dialect::DataTypeAttribute::get(ctx, phi::DataType::FLOAT32)}, + {"place", + paddle::dialect::PlaceAttribute::get( + ctx, phi::Place(static_cast(allocate_type)))}, + }; + + return attribute_map; + } + + std::vector GenerateOperationInput( + ir::IrContext* ctx, + TranslationContext* param_map, + const OpDesc& op_desc, + const std::string& normalized_op_name, + const OpInputInfoList& input_infos, + ir::Program* program) override { + return {}; + } +}; + struct SplitOpTranscriber : public OpTranscriber { std::vector GenerateOperationInput( ir::IrContext* ctx, @@ -1087,6 +1120,32 @@ struct FetchOpTranscriber : public OpTranscriber { } }; +struct ShaddowOutputOpTranscriber : public OpTranscriber { + ir::Operation* operator()(ir::IrContext* ctx, + TranslationContext* param_map, + const OpDesc& op_desc, + ir::Program* program) override { + std::vector op_inputs; + auto legacy_input_vars = op_desc.Input("x", true); + + auto defining_info = (*param_map)[legacy_input_vars[0]]; + op_inputs.push_back(defining_info.value); + + ir::AttributeMap attribute_map = { + {"parameter_name", + ir::StrAttribute::get(ctx, + op_desc.GetAttrIfExists("name"))}, + }; + + auto create_op_info = ctx->GetRegisteredOpInfo(ir::SetParameterOp::name()); + ir::Operation* operation = + ir::Operation::Create(op_inputs, attribute_map, {}, create_op_info); + program->block()->push_back(operation); + + return operation; + } +}; + // NOTE, add_n op in legacy ops don't have a kernel, so we use a new op for now struct AddNOpTranscriber : public OpTranscriber { ir::OpInfo LoopkUpOpInfo(ir::IrContext* ctx, const OpDesc& op_desc) override { @@ -1159,6 +1218,7 @@ struct OneHotTranscriber : public OpTranscriber { OpTranslator::OpTranslator() { general_handler = OpTranscriber(); special_handlers["feed"] = FeedOpTranscriber(); + special_handlers["feed_with_place"] = FeedWithPlaceOpTranscriber(); special_handlers["fetch_v2"] = FetchOpTranscriber(); special_handlers["cast"] = CastOpTranscriber(); special_handlers["split"] = SplitOpTranscriber(); @@ -1167,8 +1227,10 @@ OpTranslator::OpTranslator() { special_handlers["assign_value"] = AssignValueOpTranscriber(); special_handlers["increment"] = IncrementOpTranscriber(); special_handlers["rnn"] = RnnOpTranscriber(); + special_handlers["shaddow_output"] = ShaddowOutputOpTranscriber(); special_handlers["one_hot_v2"] = OneHotTranscriber(); special_handlers["add_n"] = AddNOpTranscriber(); + special_handlers["sum"] = AddNOpTranscriber(); } } // namespace translator diff --git a/paddle/fluid/ir_adaptor/translator/program_translator.cc b/paddle/fluid/ir_adaptor/translator/program_translator.cc index b162e8198b993..202cfc61dd304 100644 --- a/paddle/fluid/ir_adaptor/translator/program_translator.cc +++ b/paddle/fluid/ir_adaptor/translator/program_translator.cc @@ -217,7 +217,15 @@ void ProgramTranslator::SetStopGradientAttributeForAllValue( continue; } ir::OpResult value = value_info.value; + if 
(!value) { + PADDLE_THROW(phi::errors::PreconditionNotMet( + "Value of [%s] can not ber None", var_name)); + } auto* defining_op = value.owner(); + PADDLE_ENFORCE_NOT_NULL( + defining_op, + phi::errors::PreconditionNotMet( + "Defining operator of [%s] can not be nullptr", var_name)); VLOG(8) << "[op translated][stop gradient]" << var_name << " from: " << defining_op->name(); std::vector stop_gradients; diff --git a/paddle/phi/api/yaml/op_compat.yaml b/paddle/phi/api/yaml/op_compat.yaml index add6520493e1f..ed7c9d4237396 100755 --- a/paddle/phi/api/yaml/op_compat.yaml +++ b/paddle/phi/api/yaml/op_compat.yaml @@ -1029,6 +1029,9 @@ - op : feed outputs: {out: Out} +- op : feed_with_place + outputs: {out: out} + - op : fft_c2c inputs: {x: X} outputs: {out: Out} @@ -2461,6 +2464,10 @@ extra : attrs : [bool use_mkldnn=false] +- op : shaddow_output + inputs: {x: x} + outputs: {out: out} + - op : shape inputs : input : Input diff --git a/paddle/phi/api/yaml/ops.yaml b/paddle/phi/api/yaml/ops.yaml index 661de64990ee6..8368184b2839d 100644 --- a/paddle/phi/api/yaml/ops.yaml +++ b/paddle/phi/api/yaml/ops.yaml @@ -826,6 +826,18 @@ inplace: (x -> out) backward : expm1_grad +- op : feed_with_place + args : (int64_t index, DataType dtype, str name, Place place) + output : Tensor(out) + infer_meta : + func : FeedWithPlaceInferMeta + param : [index, dtype] + kernel: + func : feed_with_place + param : [index, dtype] + data_type : dtype + backend : place + - op : fft_c2c args : (Tensor x, int64_t[] axes, str normalization, bool forward) output : Tensor @@ -2212,6 +2224,16 @@ optional : master_param, master_param_out inplace : (param -> param_out), (master_param -> master_param_out) +- op : shaddow_output + args : (Tensor x, str name) + output : Tensor(out) + infer_meta : + func : UnchangedInferMeta + param : [x] + kernel: + func : shaddow_output + param : [x] + - op : shape args : (Tensor input) output : Tensor(out) diff --git a/paddle/phi/api/yaml/static_ops.yaml b/paddle/phi/api/yaml/static_ops.yaml index 5ac156ff5714d..216fca178fde7 100755 --- a/paddle/phi/api/yaml/static_ops.yaml +++ b/paddle/phi/api/yaml/static_ops.yaml @@ -244,18 +244,6 @@ param : [num_rows, num_columns, dtype] data_type : dtype -- op : feed_with_place - args : (int64_t index, DataType dtype, Place place) - output : Tensor(out) - infer_meta : - func : FeedWithPlaceInferMeta - param : [index, dtype] - kernel: - func : feed_with_place - param : [index, dtype] - data_type : dtype - backend : place - - op : floor_divide args : (Tensor x, Tensor y, int axis = -1) output : Tensor(out) diff --git a/paddle/phi/kernels/cpu/feed_with_place_kernel.cc b/paddle/phi/kernels/cpu/feed_with_place_kernel.cc index 342ad6a334cc3..5044bceda26bd 100644 --- a/paddle/phi/kernels/cpu/feed_with_place_kernel.cc +++ b/paddle/phi/kernels/cpu/feed_with_place_kernel.cc @@ -26,6 +26,11 @@ void FeedWithPlaceKernel(const Context& ctx, phi::DataType data_type, DenseTensor* out) {} +template +void ShaddowOutputKernel(const Context& ctx, + const DenseTensor& x, + DenseTensor* out) {} + } // namespace phi PD_REGISTER_KERNEL( @@ -44,3 +49,6 @@ PD_REGISTER_KERNEL(shaddow_feed, phi::bfloat16, phi::complex64, phi::complex128) {} + +PD_REGISTER_KERNEL( + shaddow_output, CPU, ALL_LAYOUT, phi::ShaddowOutputKernel, float) {} diff --git a/paddle/phi/kernels/feed_with_place_kernel.h b/paddle/phi/kernels/feed_with_place_kernel.h index 4e8e9063c8d9b..725ec0c508af1 100644 --- a/paddle/phi/kernels/feed_with_place_kernel.h +++ b/paddle/phi/kernels/feed_with_place_kernel.h @@ -22,6 +22,12 
@@ template void FeedWithPlaceKernel(const Context& ctx, int64_t index, phi::DataType data_type, + // std::string name, + DenseTensor* out); + +template +void ShaddowOutputKernel(const Context& ctx, + const DenseTensor& x, DenseTensor* out); template diff --git a/test/ir/new_ir/test_feed_with_place.py b/test/ir/new_ir/test_feed_with_place.py index 5843fe227b1bf..222a5a86460b8 100644 --- a/test/ir/new_ir/test_feed_with_place.py +++ b/test/ir/new_ir/test_feed_with_place.py @@ -30,6 +30,7 @@ def feed_with_place(): 'index': 0, 'dtype': 0, 'place': 0, + 'name': "x", }, ) return out diff --git a/test/ir/new_ir/test_standalone_new_ir.py b/test/ir/new_ir/test_standalone_new_ir.py index c67370b2e0a2f..4a00c2960c286 100644 --- a/test/ir/new_ir/test_standalone_new_ir.py +++ b/test/ir/new_ir/test_standalone_new_ir.py @@ -19,11 +19,10 @@ import paddle -paddle.enable_static() - class TestNewIr(unittest.TestCase): def test_with_new_ir(self): + paddle.enable_static() place = ( paddle.CUDAPlace(0) if paddle.is_compiled_with_cuda() @@ -48,6 +47,7 @@ def test_with_new_ir(self): class TestCombineOp(unittest.TestCase): def test_with_new_ir(self): + paddle.enable_static() place = ( paddle.CUDAPlace(0) if paddle.is_compiled_with_cuda() @@ -72,6 +72,7 @@ def test_with_new_ir(self): class TestFeedOp(unittest.TestCase): def test_with_new_ir(self): + paddle.enable_static() place = ( paddle.CUDAPlace(0) if paddle.is_compiled_with_cuda() @@ -103,6 +104,7 @@ def test_with_new_ir(self): class TestSelectedRows(unittest.TestCase): def test_with_new_ir(self): + paddle.enable_static() # TODO(phlrain): support selected rows in GPU # place = paddle.CUDAPlace(0) if paddle.is_compiled_with_cuda() else paddle.CPUPlace() place = paddle.CPUPlace() @@ -127,6 +129,7 @@ def test_with_new_ir(self): class TestAddGradOp(unittest.TestCase): def test_with_new_ir(self): + paddle.enable_static() place = ( paddle.CUDAPlace(0) if paddle.is_compiled_with_cuda() @@ -141,11 +144,9 @@ def test_with_new_ir(self): x = paddle.static.data("x", [2, 2], dtype="float32") y = paddle.static.data("y", [2, 2], dtype="float32") x.stop_gradient = False - z = x * y paddle.static.gradients(z, x) - np_a = np.random.rand(2, 2).astype("float32") np_b = np.random.rand(2, 2).astype("float32") out = exe.run( @@ -159,8 +160,63 @@ def test_with_new_ir(self): np.testing.assert_array_equal(out[0], gold_res) +class TestNewIrDygraph(unittest.TestCase): + def test_with_new_ir(self): + paddle.disable_static() + # paddle.device.set_device("cpu") + + @paddle.jit.to_static + def func(x, y): + return x + y + + x = paddle.ones([2, 2], dtype='float32') + y = paddle.ones([2, 2], dtype='float32') + z = func(x, y) + + gold_res = np.ones([2, 2], dtype="float32") * 2 + self.assertEqual( + np.array_equal( + z.numpy(), + gold_res, + ), + True, + ) + + +class TestNewIrBackwardDygraph(unittest.TestCase): + def test_with_new_ir(self): + paddle.disable_static() + build_strategy = paddle.static.BuildStrategy() + build_strategy.enable_inplace = False + + @paddle.jit.to_static(build_strategy=build_strategy) + def func(x, y): + return x * y + + x = paddle.ones([2, 2], dtype='float32') + y = paddle.ones([2, 2], dtype='float32') + x.stop_gradient = False + y.stop_gradient = False + z = func(x, y) + loss = z.mean() + loss.backward() + gold_res = np.ones([2, 2], dtype="float32") + self.assertEqual( + np.array_equal( + z.numpy(), + gold_res, + ), + True, + ) + + gold_res = np.ones([2, 2], dtype="float32") * 0.25 + np.testing.assert_array_equal(x.gradient(), gold_res) + 
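# y's gradient mirrors x's: d(mean(x * y))/dy = x / 4 = 0.25 elementwise
+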
np.testing.assert_array_equal(y.gradient(), gold_res) + + class TestSplitOp(unittest.TestCase): def test_with_new_ir(self): + paddle.enable_static() place = ( paddle.CUDAPlace(0) if paddle.is_compiled_with_cuda() @@ -186,4 +242,5 @@ def test_with_new_ir(self): if __name__ == "__main__": + paddle.enable_static() unittest.main() From 6093a7ed0a383737102e2c50c4b71aae456c0bf0 Mon Sep 17 00:00:00 2001 From: tianshuo78520a <707759223@qq.com> Date: Tue, 25 Jul 2023 13:41:19 +0800 Subject: [PATCH 27/34] Update ccache (#55136) * Update ccache * del 3.7.9 * fix error --- tools/dockerfile/Dockerfile.ubuntu20 | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/tools/dockerfile/Dockerfile.ubuntu20 b/tools/dockerfile/Dockerfile.ubuntu20 index 98bc6ec80731a..f7aeb670bab65 100644 --- a/tools/dockerfile/Dockerfile.ubuntu20 +++ b/tools/dockerfile/Dockerfile.ubuntu20 @@ -147,13 +147,14 @@ RUN pip3.7 --no-cache-dir install -r /root/requirements.txt && \ #RUN mkdir /var/run/sshd && echo 'root:root' | chpasswd && sed -ri 's/^PermitRootLogin\s+.*/PermitRootLogin yes/' /etc/ssh/sshd_config && sed -ri 's/UsePAM yes/#UsePAM yes/g' /etc/ssh/sshd_config #CMD source ~/.bashrc -# ccache 3.7.9 -RUN wget https://paddle-ci.gz.bcebos.com/ccache-3.7.9.tar.gz && \ - tar xf ccache-3.7.9.tar.gz && mkdir /usr/local/ccache-3.7.9 && cd ccache-3.7.9 && \ - ./configure -prefix=/usr/local/ccache-3.7.9 && \ +# ccache 4.2.0 +RUN wget -q https://paddle-ci.gz.bcebos.com/ccache-4.8.2.tar.gz && \ + tar xf ccache-4.8.2.tar.gz && mkdir /usr/local/ccache-4.8.2 && cd ccache-4.8.2 && \ + mkdir build && cd build && \ + cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=/usr/local/ccache-4.8.2 .. && \ make -j8 && make install && \ - ln -s /usr/local/ccache-3.7.9/bin/ccache /usr/local/bin/ccache && \ - cd ../ && rm -rf ccache-3.7.9 ccache-3.7.9.tar.gz + ln -s /usr/local/ccache-4.8.2/bin/ccache /usr/local/bin/ccache && \ + cd ../../ && rm -rf ccache-4.8.2.tar.gz # clang+llvm 3.8.0 RUN wget https://paddle-ci.cdn.bcebos.com/clang+llvm-3.8.0-x86_64-linux-gnu-ubuntu-16.04.tar.xz && \ From 690ffe814dbfc5054d4e92df878687fd638fe3a5 Mon Sep 17 00:00:00 2001 From: wanghuancoder Date: Tue, 25 Jul 2023 13:56:45 +0800 Subject: [PATCH 28/34] fix div 0 bug (#55644) --- paddle/phi/kernels/cpu/nanmedian_kernel.cc | 8 ++++++++ paddle/phi/kernels/gpu/matrix_rank_tol_kernel.cu | 12 ++++++++++++ paddle/phi/kernels/gpu/nanmedian_kernel.cu | 8 ++++++++ 3 files changed, 28 insertions(+) diff --git a/paddle/phi/kernels/cpu/nanmedian_kernel.cc b/paddle/phi/kernels/cpu/nanmedian_kernel.cc index 558d5aaebfef8..92571124dd1a6 100644 --- a/paddle/phi/kernels/cpu/nanmedian_kernel.cc +++ b/paddle/phi/kernels/cpu/nanmedian_kernel.cc @@ -116,6 +116,14 @@ void ProcessMedianKernel(const Context& dev_ctx, auto x_dim = x.dims(); int64_t x_rank = x_dim.size(); int64_t stride = x_dim[x_rank - 1]; + + PADDLE_ENFORCE_NE( + stride, + 0, + phi::errors::InvalidArgument("The input Tensor x's shape[-1] should not " + "be 0, but shape is %s now.", + x_dim)); + int64_t pre_dim = numel / stride; int64_t i = 0; diff --git a/paddle/phi/kernels/gpu/matrix_rank_tol_kernel.cu b/paddle/phi/kernels/gpu/matrix_rank_tol_kernel.cu index e4ee1f342131a..33de3c8e17876 100644 --- a/paddle/phi/kernels/gpu/matrix_rank_tol_kernel.cu +++ b/paddle/phi/kernels/gpu/matrix_rank_tol_kernel.cu @@ -340,6 +340,18 @@ void MatrixRankTolKernel(const Context& dev_ctx, auto dim_out = out->dims(); int rows = dim_x[dim_x.size() - 2]; int cols = dim_x[dim_x.size() - 1]; + PADDLE_ENFORCE_NE( + 
rows, + 0, + phi::errors::InvalidArgument("The input Tensor x's shape[-2] should not " + "be 0, but shape is %s now.", + dim_x)); + PADDLE_ENFORCE_NE( + cols, + 0, + phi::errors::InvalidArgument("The input Tensor x's shape[-1] should not " + "be 0, but shape is %s now.", + dim_x)); int k = std::min(rows, cols); auto numel = x.numel(); int batches = numel / (rows * cols); diff --git a/paddle/phi/kernels/gpu/nanmedian_kernel.cu b/paddle/phi/kernels/gpu/nanmedian_kernel.cu index 8a6be7a9bdef0..5a9d3a07cf55d 100644 --- a/paddle/phi/kernels/gpu/nanmedian_kernel.cu +++ b/paddle/phi/kernels/gpu/nanmedian_kernel.cu @@ -149,6 +149,14 @@ void ProcessMedianKernel(const Context& dev_ctx, auto x_dim = x.dims(); int64_t x_rank = x_dim.size(); int64_t stride = x_dim[x_rank - 1]; + + PADDLE_ENFORCE_NE( + stride, + 0, + phi::errors::InvalidArgument("The input Tensor x's shape[-1] should not " + "be 0, but shape is %s now.", + x_dim)); + int64_t pre_dim = numel / stride; int64_t i = 0; From 0cd422b66ad449af65685db3a345a423f90dbd71 Mon Sep 17 00:00:00 2001 From: Lucas <33367939+cqulilujia@users.noreply.github.com> Date: Tue, 25 Jul 2023 14:25:42 +0800 Subject: [PATCH 29/34] fix bugs in rnn op (#55656) --- paddle/phi/kernels/xpu/rnn_kernel.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/phi/kernels/xpu/rnn_kernel.cc b/paddle/phi/kernels/xpu/rnn_kernel.cc index 10fdfdbc4b91f..87773c8a97267 100644 --- a/paddle/phi/kernels/xpu/rnn_kernel.cc +++ b/paddle/phi/kernels/xpu/rnn_kernel.cc @@ -44,7 +44,7 @@ void RnnKernel(const Context& dev_ctx, } dropout_state->Resize(out->dims()); - dev_ctx.template Alloc(dropout_state); + dev_ctx.template Alloc(dropout_state); phi::funcs::SetConstant ones; ones(dev_ctx, dropout_state, static_cast(1)); @@ -97,7 +97,7 @@ void RnnKernel(const Context& dev_ctx, int gate_num = 4; int hidden_data_idx = (num_layers - 1); - hidden_data_idx += (gate_num + 1) * num_layers; + hidden_data_idx += (gate_num + 2) * num_layers; const int& block_size = direction_num * seq_len * batch_size * hidden_size; reserve->Resize({hidden_data_idx, block_size}); dev_ctx.template Alloc(reserve); From 7da1ffbe4086bbe9bbc9368baa29c04d3e40d4ac Mon Sep 17 00:00:00 2001 From: wentao yu Date: Tue, 25 Jul 2023 14:37:07 +0800 Subject: [PATCH 30/34] remove fluid allreduce op (#55672) --- .../operators/collective/allreduce_op.cc | 84 ----------------- .../operators/collective/allreduce_op.cu.cc | 28 ------ .../fluid/operators/collective/allreduce_op.h | 93 ------------------- 3 files changed, 205 deletions(-) delete mode 100644 paddle/fluid/operators/collective/allreduce_op.cc delete mode 100644 paddle/fluid/operators/collective/allreduce_op.cu.cc delete mode 100644 paddle/fluid/operators/collective/allreduce_op.h diff --git a/paddle/fluid/operators/collective/allreduce_op.cc b/paddle/fluid/operators/collective/allreduce_op.cc deleted file mode 100644 index dc3dfff58e96e..0000000000000 --- a/paddle/fluid/operators/collective/allreduce_op.cc +++ /dev/null @@ -1,84 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/collective/allreduce_op.h" - -#include // NOLINT -#include - -namespace paddle { -namespace operators { - -class AllReduceDelOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override {} - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - return phi::KernelKey(OperatorWithKernel::IndicateVarDataType(ctx, "X"), - ctx.GetPlace()); - } -}; - -class AllReduceDelOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", "(Tensor), tensor to be allreduced."); - AddOutput("Out", "(Tensor) the result of allreduced."); - AddAttr("reduce_type", "(int) determine the reduce type.") - .SetDefault(0); - AddAttr( - "sync_mode", - "(bool) whether to synchronize the CUDA stream after nccl call.") - .SetDefault(false); - AddComment(R"DOC( -***AllReduce Operator*** - -Call NCCL AllReduce internally. Note that this op must be used when one -thread is managing one GPU device. - -For speed reasons, reduce_type should be an integer: - -0: sum -1: prod -2: max -3: min - -If input and output are the same variable, in-place allreduce will be used. -)DOC"); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_WITHOUT_GRADIENT(allreduce, - ops::AllReduceDelOp, - ops::AllReduceDelOpMaker); - -PD_REGISTER_STRUCT_KERNEL(allreduce, - CPU, - ALL_LAYOUT, - ops::AllReduceOpKernel, - float, - double, - int, - int64_t, - plat::float16) {} diff --git a/paddle/fluid/operators/collective/allreduce_op.cu.cc b/paddle/fluid/operators/collective/allreduce_op.cu.cc deleted file mode 100644 index 0c9b95c76866b..0000000000000 --- a/paddle/fluid/operators/collective/allreduce_op.cu.cc +++ /dev/null @@ -1,28 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/collective/allreduce_op.h" - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -PD_REGISTER_STRUCT_KERNEL(allreduce, - GPU, - ALL_LAYOUT, - ops::AllReduceOpKernel, - float, - double, - int, - int64_t, - plat::float16) {} diff --git a/paddle/fluid/operators/collective/allreduce_op.h b/paddle/fluid/operators/collective/allreduce_op.h deleted file mode 100644 index 794e37c312a9b..0000000000000 --- a/paddle/fluid/operators/collective/allreduce_op.h +++ /dev/null @@ -1,93 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include -#include -#include - -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/op_registry.h" - -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) -#include "paddle/fluid/platform/device/gpu/nccl_helper.h" -#endif - -namespace paddle { -namespace operators { - -template -class AllReduceOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto place = ctx.GetPlace(); - PADDLE_ENFORCE_EQ(platform::is_gpu_place(place), - true, - platform::errors::PreconditionNotMet( - "AllReduce op can run on gpu place only for now.")); -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) - auto& dev_ctx = ctx.template device_context(); - auto in = ctx.Input("X"); - auto out = ctx.Output("Out"); - - int dtype = - platform::ToNCCLDataType(framework::TransToProtoVarType(in->dtype())); - int64_t numel = in->numel(); - auto* sendbuff = in->data(); - out->Resize(in->dims()); - void* recvbuff = out->mutable_data(place); - - auto* comm = dev_ctx.nccl_comm(); - // FIXME(typhoonzero): should use nccl stream here. - auto stream = dev_ctx.stream(); - PADDLE_ENFORCE_NOT_NULL( - stream, platform::errors::NotFound("Should initialize NCCL firstly.")); - - int reduce_type = ctx.Attr("reduce_type"); - ncclRedOp_t red_type = ncclSum; - switch (reduce_type) { - case 0: - red_type = ncclSum; - break; - case 1: - red_type = ncclProd; - break; - case 2: - red_type = ncclMax; - break; - case 3: - red_type = ncclMin; - break; - } - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::ncclAllReduce(sendbuff, - recvbuff, - numel, - static_cast(dtype), - red_type, - comm, - stream)); - if (ctx.Attr("sync_mode")) { - platform::GpuStreamSync(stream); - } -#else - PADDLE_THROW(platform::errors::PreconditionNotMet( - "PaddlePaddle should compile with GPU.")); -#endif - } -}; - -} // namespace operators -} // namespace paddle From c737f0aedb64a3a62967ed8683f68f4fa677e8f6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=82=85=E5=89=91=E5=AF=92?= Date: Tue, 25 Jul 2023 14:37:51 +0800 Subject: [PATCH 31/34] add all false bool indices support for index_put (#55655) --- .../phi/kernels/cpu/index_put_grad_kernel.cc | 14 +++ paddle/phi/kernels/cpu/index_put_kernel.cc | 6 + paddle/phi/kernels/funcs/index_put_utils.h | 5 + .../phi/kernels/gpu/index_put_grad_kernel.cu | 15 +++ paddle/phi/kernels/gpu/index_put_kernel.cu | 6 + test/legacy_test/test_index_put_op.py | 107 +++++++++++++++--- 6 files changed, 140 insertions(+), 13 deletions(-) diff --git a/paddle/phi/kernels/cpu/index_put_grad_kernel.cc b/paddle/phi/kernels/cpu/index_put_grad_kernel.cc index 9426a5def6527..89b3ba61b83d3 100644 --- a/paddle/phi/kernels/cpu/index_put_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/index_put_grad_kernel.cc @@ -16,6 +16,7 @@ #include #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/cast_kernel.h" +#include "paddle/phi/kernels/full_kernel.h" #include "paddle/phi/kernels/funcs/index_put_utils.h" #include "paddle/phi/kernels/reduce_sum_kernel.h" @@ -188,6 +189,19 @@ void 
IndexPutGradKernel(const Context& dev_ctx, std::vector tmp_args; std::vector int_indices_v = funcs::DealWithBoolIndices(dev_ctx, indices, &tmp_args); + if (int_indices_v.empty()) { + if (x_grad) { + phi::Copy(dev_ctx, out_grad, dev_ctx.GetPlace(), false, x_grad); + } + if (value_grad) { + FullKernel(dev_ctx, + phi::vectorize(value_grad->dims()), + 0.0f, + value_grad->dtype(), + value_grad); + } + return; + } auto bd_dim = funcs::BroadCastTensorsDims(int_indices_v); std::vector res_dim_v(phi::vectorize(bd_dim)); diff --git a/paddle/phi/kernels/cpu/index_put_kernel.cc b/paddle/phi/kernels/cpu/index_put_kernel.cc index 3ab05fea8fa47..34701539c8ef3 100644 --- a/paddle/phi/kernels/cpu/index_put_kernel.cc +++ b/paddle/phi/kernels/cpu/index_put_kernel.cc @@ -117,6 +117,12 @@ void IndexPutKernel(const Context& dev_ctx, std::vector tmp_args; std::vector int_indices_v = funcs::DealWithBoolIndices(dev_ctx, indices, &tmp_args); + if (int_indices_v.empty()) { + if (!out->initialized()) { + phi::Copy(dev_ctx, x, dev_ctx.GetPlace(), false, out); + } + return; + } auto bd_dim = funcs::BroadCastTensorsDims(int_indices_v); diff --git a/paddle/phi/kernels/funcs/index_put_utils.h b/paddle/phi/kernels/funcs/index_put_utils.h index c135cb82e2ec3..1e0c9eaaaf5a5 100644 --- a/paddle/phi/kernels/funcs/index_put_utils.h +++ b/paddle/phi/kernels/funcs/index_put_utils.h @@ -88,6 +88,11 @@ std::vector DealWithBoolIndices( nonzero_indices.Resize(phi::make_ddim({-1, rank})); NonZeroKernel(dev_ctx, *indices_v[i], &nonzero_indices); + if (nonzero_indices.numel() == 0) { + std::vector empty_indices; + return empty_indices; + } + std::vector integer_indices(rank, nullptr); const int tmp_ix = tmp_indices_v->size(); for (int i = 0; i < rank; ++i) { diff --git a/paddle/phi/kernels/gpu/index_put_grad_kernel.cu b/paddle/phi/kernels/gpu/index_put_grad_kernel.cu index 1a301323cad87..8f2eba7185293 100644 --- a/paddle/phi/kernels/gpu/index_put_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/index_put_grad_kernel.cu @@ -18,6 +18,7 @@ #include "paddle/phi/backends/gpu/gpu_launch_config.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/cast_kernel.h" +#include "paddle/phi/kernels/full_kernel.h" #include "paddle/phi/kernels/funcs/index_put_utils.h" #include "paddle/phi/kernels/reduce_sum_kernel.h" @@ -219,6 +220,20 @@ void IndexPutGradKernel(const Context& dev_ctx, std::vector tmp_args; std::vector int_indices_v = funcs::DealWithBoolIndices(dev_ctx, indices, &tmp_args); + if (int_indices_v.empty()) { + if (x_grad) { + phi::Copy(dev_ctx, out_grad, dev_ctx.GetPlace(), false, x_grad); + } + if (value_grad) { + FullKernel(dev_ctx, + phi::vectorize(value_grad->dims()), + 0.0f, + value_grad->dtype(), + value_grad); + } + return; + } + const size_t total_dims = x.dims().size(); auto bd_dim = funcs::BroadCastTensorsDims(int_indices_v); diff --git a/paddle/phi/kernels/gpu/index_put_kernel.cu b/paddle/phi/kernels/gpu/index_put_kernel.cu index 9710f5baca77d..4244e755b6597 100644 --- a/paddle/phi/kernels/gpu/index_put_kernel.cu +++ b/paddle/phi/kernels/gpu/index_put_kernel.cu @@ -118,6 +118,12 @@ void IndexPutKernel(const Context& dev_ctx, std::vector tmp_args; std::vector int_indices_v = funcs::DealWithBoolIndices(dev_ctx, indices, &tmp_args); + if (int_indices_v.empty()) { + if (!out->initialized()) { + phi::Copy(dev_ctx, x, dev_ctx.GetPlace(), false, out); + } + return; + } const size_t total_dims = x.dims().size(); auto bd_dim = funcs::BroadCastTensorsDims(int_indices_v); diff --git a/test/legacy_test/test_index_put_op.py 
b/test/legacy_test/test_index_put_op.py index f21f7b084bde4..44e50c1567721 100644 --- a/test/legacy_test/test_index_put_op.py +++ b/test/legacy_test/test_index_put_op.py @@ -47,14 +47,15 @@ def has_duplicate_index(indices, shapes): return True -def gen_indices_np(x_shape, indices_shapes, index_type): +def gen_indices_np(x_shape, indices_shapes, index_type, is_all_false): indices = [] if index_type == np.bool_: indice = np.zeros(indices_shapes[0], dtype=np.bool_) - indice.flatten() - for i in range(len(indice)): - indice[i] = (i & 1) == 0 - indice = indice.reshape(indices_shapes[0]) + if not is_all_false: + indice.flatten() + for i in range(len(indice)): + indice[i] = (i & 1) == 0 + indice = indice.reshape(indices_shapes[0]) indices.append(indice) else: while True: @@ -78,6 +79,7 @@ def gen_indices_np(x_shape, indices_shapes, index_type): class TestIndexPutAPIBase(unittest.TestCase): def setUp(self): self.mixed_indices = False + self.is_all_false = False self.init_dtype_type() self.setPlace() self.x_np = np.random.random(self.x_shape).astype(self.dtype_np) @@ -85,17 +87,26 @@ def setUp(self): if self.mixed_indices: tmp_indices_np1 = gen_indices_np( - self.x_shape, self.indices_shapes, self.index_type_np + self.x_shape, + self.indices_shapes, + self.index_type_np, + self.is_all_false, ) tmp_indices_np2 = gen_indices_np( - self.x_shape, self.indices_shapes1, self.index_type_np1 + self.x_shape, + self.indices_shapes1, + self.index_type_np1, + self.is_all_false, ) self.indices_np = tuple( list(tmp_indices_np1) + list(tmp_indices_np2) ) else: self.indices_np = gen_indices_np( - self.x_shape, self.indices_shapes, self.index_type_np + self.x_shape, + self.indices_shapes, + self.index_type_np, + self.is_all_false, ) def init_dtype_type(self): @@ -565,6 +576,32 @@ def init_dtype_type(self): self.accumulate = True +class TestIndexPutAPI31(TestIndexPutAPIBase): + def init_dtype_type(self): + self.dtype_np = np.bool_ + self.index_type_np = np.int32 + self.x_shape = (100, 110) + self.indices_shapes = [(21,), (21,)] + self.value_shape = (21,) + self.dtype_pd = paddle.bool + self.index_type_pd = paddle.int32 + self.accumulate = False + self.is_all_false = True + + +class TestIndexPutAPI32(TestIndexPutAPIBase): + def init_dtype_type(self): + self.dtype_np = np.bool_ + self.index_type_np = np.int32 + self.x_shape = (100, 110) + self.indices_shapes = [(21,), (21,)] + self.value_shape = (21,) + self.dtype_pd = paddle.bool + self.index_type_pd = paddle.int32 + self.accumulate = True + self.is_all_false = True + + class TestIndexPutInplaceAPI(unittest.TestCase): def setUp(self): self.init_dtype_type() @@ -572,7 +609,7 @@ def setUp(self): self.x_np = np.random.random(self.x_shape).astype(self.dtype_np) self.value_np = np.random.random(self.value_shape).astype(self.dtype_np) self.indices_np = gen_indices_np( - self.x_shape, self.indices_shapes, self.index_type_np + self.x_shape, self.indices_shapes, self.index_type_np, False ) def init_dtype_type(self): @@ -678,7 +715,7 @@ def test_backward(self): atol=1e-7, ) - def test_backwardScalarVal(self): + def test_backward_scalarval(self): paddle.disable_static() for place in self.place: paddle.device.set_device(place) @@ -719,7 +756,7 @@ def test_backwardScalarVal(self): np.array([4.0], dtype=np.float64), dvalue.numpy(), atol=1e-7 ) - def test_backwardBroadCastValue(self): + def test_backward_broadcastvalue(self): paddle.disable_static() for place in self.place: paddle.device.set_device(place) @@ -764,7 +801,7 @@ def test_backwardBroadCastValue(self): atol=1e-7, ) - def 
test_backwardBroadCastValue1(self): + def test_backward_broadcastvalue1(self): paddle.disable_static() for place in self.place: paddle.device.set_device(place) @@ -809,7 +846,7 @@ def test_backwardBroadCastValue1(self): atol=1e-7, ) - def test_backwardBroadCastValue2(self): + def test_backward_broadcastvalue2(self): paddle.disable_static() for place in self.place: paddle.device.set_device(place) @@ -854,6 +891,50 @@ def test_backwardBroadCastValue2(self): atol=1e-7, ) + def test_backward_all_false_bool_indice(self): + paddle.disable_static() + for place in self.place: + paddle.device.set_device(place) + value = paddle.ones(shape=[2, 1], dtype=paddle.float64) + x = paddle.ones(shape=[16, 21], dtype=paddle.float64) + ix = paddle.zeros(shape=[16, 21], dtype=paddle.bool) + + value.stop_gradient = False + x.stop_gradient = False + out = paddle.index_put(x, (ix,), value, False) + + dx, dvalue = paddle.grad( + outputs=[out], + inputs=[x, value], + create_graph=False, + retain_graph=True, + ) + ref_dx = np.ones(shape=[16, 21], dtype=np.float64) + + np.testing.assert_allclose(ref_dx, dx.numpy(), atol=1e-7) + np.testing.assert_allclose( + np.array([[0.0], [0.0]], dtype=np.float64), + dvalue.numpy(), + atol=1e-7, + ) + + out = paddle.index_put(x, (ix,), value, True) + + dx, dvalue = paddle.grad( + outputs=[out], + inputs=[x, value], + create_graph=False, + retain_graph=True, + ) + ref_dx = np.ones(shape=[16, 21], dtype=np.float64) + + np.testing.assert_allclose(ref_dx, dx.numpy(), atol=1e-7) + np.testing.assert_allclose( + np.array([[0.0], [0.0]], dtype=np.float64), + dvalue.numpy(), + atol=1e-7, + ) + def test_backward_in_static(self): paddle.enable_static() exe = paddle.static.Executor() From f9e1b2d2ad1d9964a80d20df72609ef4d18f83fe Mon Sep 17 00:00:00 2001 From: Aurelius84 Date: Tue, 25 Jul 2023 15:05:40 +0800 Subject: [PATCH 32/34] [NewIR]Support Instruction.Run in CINN for Runtime::Program (#55680) --- paddle/cinn/hlir/framework/new_ir_compiler.h | 53 +++++++++++++------ .../cpp/ir/cinn/graph_compiler_new_ir_test.cc | 20 +++++-- 2 files changed, 53 insertions(+), 20 deletions(-) diff --git a/paddle/cinn/hlir/framework/new_ir_compiler.h b/paddle/cinn/hlir/framework/new_ir_compiler.h index c92e84563e7aa..fc4944d1ca241 100644 --- a/paddle/cinn/hlir/framework/new_ir_compiler.h +++ b/paddle/cinn/hlir/framework/new_ir_compiler.h @@ -13,6 +13,7 @@ // limitations under the License. #pragma once +#include #include #include #include "paddle/cinn/common/context.h" @@ -30,9 +31,15 @@ namespace cinn { namespace hlir { namespace framework { -// TODO(Aurelius): Need add name mapping logic in REGISTER_CINN_OP -// macros or attempt to unify Op name with Paddle and CINN. -static const std::unordered_map OP_NAMES = { +struct CompatibleInfo { + static constexpr char* kInputPrefix = "input_"; + static constexpr char* kOutputPrefix = "output_"; + // TODO(Aurelius): Need add name mapping logic in REGISTER_CINN_OP + // macros or attempt to unify Op name with Paddle and CINN. + static const std::unordered_map OP_NAMES; +}; + +const std::unordered_map CompatibleInfo::OP_NAMES = { {"pd.full", "fill_constant"}, {"pd.matmul", "matmul"}}; // TODO(Aurelius84): Need abstract this logic to implement Proxy for @@ -70,18 +77,32 @@ class NewIRCompiler final { compiler_->Build(build_module, ""); auto instructions = BuildInstructions(groups); + + // TODO(Aurelius84): Instantiate all tensors on compile-time, which is + // controlled by 'options.with_instantiate_variables' in GraphCompiler. 
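+    // Holding every buffer for the lifetime of the program trades peak
+    // memory for the guarantee that Instruction::Run never touches an
+    // unallocated tensor.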
+ // Moreover, it's better to implement InsertBufferHandlers() logic + // to automatically insert Malloc and Free instructions. + for (auto& name : scope_->var_names()) { + std::string var_name({name.data(), name.size()}); + VLOG(4) << "Instantiate " << var_name << " on compile-time"; + auto* var = scope_->Var(var_name); + auto& tensor = absl::get(*var); + tensor->mutable_data(target_, tensor->type()); + } return std::make_unique(scope_, std::move(instructions)); } std::vector GetOpFunc(const ::ir::Operation& op, int idx) { std::vector inputs; std::vector cinn_inputs; - VLOG(4) << "GetOpFunc for op: " << op.name(); + auto op_name = op.name(); + VLOG(4) << "GetOpFunc for op: " << op_name; // step 1: Deal with Oprands for (int i = 0; i < op.num_operands(); ++i) { auto in_value = op.operand(i); // TODO(Aurelius84): For now, use addr as name but it's not wise. - std::string input_id = std::to_string(std::hash<::ir::Value>()(in_value)); + std::string input_id = CompatibleInfo::kInputPrefix + + std::to_string(std::hash<::ir::Value>()(in_value)); // NOTE(Aurelius84): whether need to support other Type? auto type_info = in_value.type().dyn_cast(); @@ -100,8 +121,7 @@ class NewIRCompiler final { cinn_inputs.push_back(common::CINNValue(temp)); } for (auto out_name : OpGetOutputNames(op)) { - cinn_inputs.push_back( - common::CINNValue(op.name().substr(3) + "_" + out_name)); + cinn_inputs.push_back(common::CINNValue(out_name)); } VLOG(4) << "inputs.size(): " << inputs.size(); @@ -124,14 +144,14 @@ class NewIRCompiler final { { VLOG(4) << "op.attributes():" << op.attributes().size(); auto attrs = utils::ConvertAttributes(op.attributes()); - node_attrs.node_name = OP_NAMES.at(op.name()); + node_attrs.node_name = CompatibleInfo::OP_NAMES.at(op_name); node_attrs.attr_store = std::move(attrs); } auto& strategy = Operator::GetAttrs("CINNStrategy"); // NOTE(Aurelius84): Do we need replace all hlir::framework Operator with // ::ir::Program ? 
const hlir::framework::Operator* cinn_op = - Operator::Get(OP_NAMES.at(op.name())); + Operator::Get(CompatibleInfo::OP_NAMES.at(op_name)); auto impl = OpStrategy::SelectImpl( strategy[cinn_op](node_attrs, inputs, out_types, out_shapes, target_)); common::CINNValuePack C = @@ -223,7 +243,8 @@ class NewIRCompiler final { std::unordered_set repeat; for (int i = 0; i < op.num_operands(); ++i) { auto value = op.operand(i); - std::string name = std::to_string(std::hash<::ir::Value>()(value)); + std::string name = CompatibleInfo::kInputPrefix + + std::to_string(std::hash<::ir::Value>()(value)); if (repeat.count(name)) { continue; } @@ -237,7 +258,8 @@ class NewIRCompiler final { std::vector names; for (int i = 0; i < op.num_results(); ++i) { auto value = op.result(i); - std::string name = std::to_string(std::hash<::ir::Value>()(value)); + std::string name = CompatibleInfo::kOutputPrefix + + std::to_string(std::hash<::ir::Value>()(value)); names.push_back(std::move(name)); } return names; @@ -257,11 +279,12 @@ std::shared_ptr BuildScope(const Target& target, std::unordered_set<::ir::Value> visited; auto scope = std::make_shared(); - auto create_var = [&](::ir::Value value) { + auto create_var = [&](const std::string& name_prefix, ::ir::Value value) { if (visited.count(value) > 0) return; visited.emplace(value); - std::string name = std::to_string(std::hash<::ir::Value>()(value)); + std::string name = + name_prefix + std::to_string(std::hash<::ir::Value>()(value)); auto type_info = value.type().dyn_cast(); auto* var = scope->Var(name); auto& tensor = absl::get(*var); @@ -279,12 +302,12 @@ std::shared_ptr BuildScope(const Target& target, // visit OpOprands for (auto i = 0; i < (*it)->num_operands(); ++i) { auto in_value = (*it)->operand(i); - create_var(in_value); + create_var(CompatibleInfo::kInputPrefix, in_value); } for (auto i = 0; i < (*it)->num_results(); ++i) { auto out_value = (*it)->result(i); - create_var(out_value); + create_var(CompatibleInfo::kOutputPrefix, out_value); } } return scope; diff --git a/test/cpp/ir/cinn/graph_compiler_new_ir_test.cc b/test/cpp/ir/cinn/graph_compiler_new_ir_test.cc index a5f04d257357a..42ef6fe53d059 100644 --- a/test/cpp/ir/cinn/graph_compiler_new_ir_test.cc +++ b/test/cpp/ir/cinn/graph_compiler_new_ir_test.cc @@ -24,6 +24,7 @@ #include "paddle/cinn/frontend/net_builder.h" #include "paddle/cinn/frontend/optimize.h" #include "paddle/cinn/hlir/framework/graph_compiler.h" +#include "paddle/cinn/utils/data_util.h" #include "paddle/cinn/hlir/framework/new_ir_compiler.h" @@ -33,15 +34,16 @@ TEST(GraphCompier, TestNewIR) { ::ir::Program program(ctx); ::ir::Builder builder = ::ir::Builder(ctx, program.block()); + const float value = 2.0; auto full_op_x = builder.Build(std::vector{64, 128}, - 1.0, + value, phi::DataType::FLOAT32, phi::CPUPlace()); auto full_op_y = builder.Build(std::vector{128, 64}, - 2.0, + value, phi::DataType::FLOAT32, phi::CPUPlace()); // TODO(Aurelius84): test more op @@ -61,7 +63,15 @@ TEST(GraphCompier, TestNewIR) { cinn::hlir::framework::NewIRCompiler ir_compiler(program, target, scope); auto runtime_program = ir_compiler.Build(); - // FIXME(Aurelius84): It raised illegal memory access while deconstructor - // after running all instruction, but it's ok under GLOG_v=10. 
-  // ASSERT_NO_THROW(runtime_program->Execute());
+  ASSERT_NO_THROW(runtime_program->Execute());
+
+  for (auto& var_name : scope->var_names()) {
+    std::string name = {var_name.begin(), var_name.end()};
+    std::vector<float> data =
+        cinn::GetTensorData<float>(scope->GetTensor(name), target);
+    for (int i = 0; i < data.size(); ++i) {
+      LOG_FIRST_N(INFO, 3) << "data: " << data[i];
+      ASSERT_NEAR(data[i], value, 1e-5);
+    }
+  }
 }

From 017a6164d934b001c25cb8c454345ad449747c24 Mon Sep 17 00:00:00 2001
From: Jeng Bai-Cheng
Date: Tue, 25 Jul 2023 15:24:05 +0800
Subject: [PATCH 33/34] Bugfix, fast layer norm, OOB (#55639)

* Fix LayerNormForward perf issue

* Bugfix, fast_layer_norm OOB

* apply pre-commit

---------

Co-authored-by: Shijie Wang
---
 paddle/phi/kernels/funcs/layer_norm_impl.cu.h | 21 +++++++++++++++----
 paddle/phi/kernels/gpu/layer_norm_kernel.cu   |  6 +++---
 test/legacy_test/test_layer_norm_op.py        |  8 ++++++-
 3 files changed, 27 insertions(+), 8 deletions(-)

diff --git a/paddle/phi/kernels/funcs/layer_norm_impl.cu.h b/paddle/phi/kernels/funcs/layer_norm_impl.cu.h
index 1d067b0fc2918..e2d908b853188 100644
--- a/paddle/phi/kernels/funcs/layer_norm_impl.cu.h
+++ b/paddle/phi/kernels/funcs/layer_norm_impl.cu.h
@@ -217,8 +217,13 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fast_ln_fwd_kernel(
   Vec_scale beta[LDGS];
 #pragma unroll
   for (int it = 0, col = c; it < LDGS; it++) {
-    phi::Load<ScaleT, VecSize>(gamma_ptr + col * VecSize, &gamma[it]);
-    phi::Load<ScaleT, VecSize>(beta_ptr + col * VecSize, &beta[it]);
+    if (col < cols) {
+      phi::Load<ScaleT, VecSize>(gamma_ptr + col * VecSize, &gamma[it]);
+      phi::Load<ScaleT, VecSize>(beta_ptr + col * VecSize, &beta[it]);
+    } else {
+      gamma[it] = Vec_scale{};
+      beta[it] = Vec_scale{};
+    }
     col += THREADS_PER_ROW;
   }
 
@@ -227,7 +232,12 @@
   Vec x[LDGS];
 #pragma unroll
   for (int it = 0, col = c; it < LDGS; it++) {
-    phi::Load<T, VecSize>(x_ptr + row * ELTS_PER_ROW + col * VecSize, &x[it]);
+    if (col < cols) {
+      phi::Load<T, VecSize>(x_ptr + row * ELTS_PER_ROW + col * VecSize,
+                            &x[it]);
+    } else {
+      x[it] = Vec{};
+    }
     col += THREADS_PER_ROW;
   }
   U xf[LDGS * VecSize];
@@ -324,7 +334,10 @@
 
 #pragma unroll
   for (int it = 0, col = c; it < LDGS; it++) {
-    phi::Store<T, VecSize>(x[it], y_ptr + row * ELTS_PER_ROW + col * VecSize);
+    if (col < cols) {
+      phi::Store<T, VecSize>(x[it],
+                             y_ptr + row * ELTS_PER_ROW + col * VecSize);
+    }
     col += THREADS_PER_ROW;
   }
 }
diff --git a/paddle/phi/kernels/gpu/layer_norm_kernel.cu b/paddle/phi/kernels/gpu/layer_norm_kernel.cu
index 34425d8cfcfe2..c5bb0c288f260 100644
--- a/paddle/phi/kernels/gpu/layer_norm_kernel.cu
+++ b/paddle/phi/kernels/gpu/layer_norm_kernel.cu
@@ -578,7 +578,8 @@ void LayerNormKernel(const Context &dev_ctx,
                 VecSize,                                                   \
                 WARPS_M,                                                   \
                 WARPS_N,                                                   \
-                BYTES_PER_LDG>                                             \
+                BYTES_PER_LDG,                                             \
+                feature_size>                                              \
         <<<grid, THREADS_PER_CTA, 0, stream>>>(                            \
             batch_size,                                                    \
             feature_size,                                                  \
@@ -605,8 +606,7 @@ void LayerNormKernel(const Context &dev_ctx,
   if ((feature_size >= 768 && feature_size <= 2048 &&
        feature_size % 256 == 0 || feature_size == 4096) &&
       scale != nullptr && bias != nullptr) {
-    // can_call_fast_kernel = true;
-    can_call_fast_kernel = false;
+    can_call_fast_kernel = true;
   }
 
   if (can_call_fast_kernel) {
diff --git a/test/legacy_test/test_layer_norm_op.py b/test/legacy_test/test_layer_norm_op.py
index 32d23ad3e1c72..4af8c68346230 100644
--- a/test/legacy_test/test_layer_norm_op.py
+++ b/test/legacy_test/test_layer_norm_op.py
@@ -515,7 +515,13 @@ def setUp(self):
         self.use_cudnn = True
 
     def __assert_close(self, tensor, np_array, msg, atol=1e-4):
-        self.assertTrue(np.allclose(np.array(tensor), np_array, atol=atol), msg)
+        np.testing.assert_allclose(
+            np.array(tensor).flatten(),
+            np_array.flatten(),
+            rtol=1e-3,
+            atol=atol,
+            err_msg=msg,
+        )
 
     def check_forward_backward(
         self,

From 8db3ff1f8daa12f9cdde98a2d95a2134ea5b61d7 Mon Sep 17 00:00:00 2001
From: lishicheng1996 <43111799+lishicheng1996@users.noreply.github.com>
Date: Tue, 25 Jul 2023 16:15:06 +0800
Subject: [PATCH 34/34] fix a bug caused by hipcc lambda value capture (#55612)

---
 paddle/phi/kernels/funcs/blas/blas_impl.hip.h | 50 +++++++++++++++++++
 1 file changed, 50 insertions(+)

diff --git a/paddle/phi/kernels/funcs/blas/blas_impl.hip.h b/paddle/phi/kernels/funcs/blas/blas_impl.hip.h
index 6aa41e4f4a2b6..805a718ab85ed 100644
--- a/paddle/phi/kernels/funcs/blas/blas_impl.hip.h
+++ b/paddle/phi/kernels/funcs/blas/blas_impl.hip.h
@@ -1173,6 +1173,56 @@ void Blas<phi::GPUContext>::BatchedGEMM(CBLAS_TRANSPOSE transA,
   });
 }
 
+template <>
+template <>
+inline void Blas<phi::GPUContext>::BatchedGEMM(CBLAS_TRANSPOSE transA,
+                                               CBLAS_TRANSPOSE transB,
+                                               int M,
+                                               int N,
+                                               int K,
+                                               float16 alpha,
+                                               const float16 *A,
+                                               const float16 *B,
+                                               float16 beta,
+                                               float16 *C,
+                                               int batchCount,
+                                               int64_t strideA,
+                                               int64_t strideB) const {
+  // Note that cublas follows fortran order, so the order is different from
+  // the cblas convention.
+  int lda = (transA == CblasNoTrans) ? K : M;
+  int ldb = (transB == CblasNoTrans) ? N : K;
+  int ldc = N;
+  rocblas_operation cuTransA = (transA == CblasNoTrans)
+                                   ? rocblas_operation_none
+                                   : rocblas_operation_transpose;
+  rocblas_operation cuTransB = (transB == CblasNoTrans)
+                                   ? rocblas_operation_none
+                                   : rocblas_operation_transpose;
+  const int64_t strideC = M * N;
+  context_.CublasCall([&](rocblas_handle handle) {
+    PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::rocblas_hgemm_strided_batched(
+        handle,
+        cuTransB,
+        cuTransA,
+        N,
+        M,
+        K,
+        reinterpret_cast<const rocblas_half *>(&alpha),
+        reinterpret_cast<const rocblas_half *>(B),
+        ldb,
+        strideB,
+        reinterpret_cast<const rocblas_half *>(A),
+        lda,
+        strideA,
+        reinterpret_cast<const rocblas_half *>(&beta),
+        reinterpret_cast<rocblas_half *>(C),
+        ldc,
+        strideC,
+        batchCount));
+  });
+}
+
 // note(wangran16): unknown bug. parameters dislocation when calling
 // GEMM_STRIDED_BATCH and GEMM_STRIDED_BATCH
 template <>
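
Note on PATCH 33: the fix wraps every vectorized gamma/beta/x load and the final
store in a `col < cols` guard, zero-filling vectors whose column index would run
past the end of the row. Below is a minimal standalone C++ sketch of that
bounds-guarded vector-load pattern; `Vec`, `kVecSize`, and `LoadVec` are
hypothetical stand-ins for the `phi::` vector types and `phi::Load`, not the
Paddle API, and the guard arithmetic is simplified to element units.

// Standalone sketch of a bounds-guarded vector load (illustrative only).
#include <array>
#include <cstdio>
#include <vector>

constexpr int kVecSize = 4;
using Vec = std::array<float, kVecSize>;

// Stand-in for phi::Load: copies kVecSize contiguous floats.
Vec LoadVec(const float* ptr) {
  Vec v{};
  for (int i = 0; i < kVecSize; ++i) v[i] = ptr[i];
  return v;
}

int main() {
  const int cols = 8;                    // row width in elements
  std::vector<float> gamma(cols, 1.0f);  // exactly two full vectors
  for (int col = 0; col < 3; ++col) {    // a third load would overrun the row
    Vec g = (col + 1) * kVecSize <= cols
                ? LoadVec(gamma.data() + col * kVecSize)
                : Vec{};                 // zero-fill instead of reading OOB
    std::printf("col %d -> %.1f\n", col, g[0]);
  }
  return 0;
}

Zero-filling the skipped vectors keeps the later reduction arithmetic
well-defined, since the zeros contribute nothing to the row sums.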
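
Note on PATCH 34: the added float16 specialization passes a callback to
context_.CublasCall whose lambda captures every GEMM parameter by reference
with [&]; the commit title attributes the original miscompilation to hipcc's
handling of lambda value capture. The sketch below shows only that capture
pattern; FakeHandle and CallWithHandle are hypothetical stand-ins for
rocblas_handle and the Paddle context callback, not the real API.

// Sketch of by-reference capture in a handle callback (illustrative only).
#include <functional>
#include <iostream>

struct FakeHandle {};

// Runs the callback with a live handle, in the style of CublasCall.
void CallWithHandle(const std::function<void(FakeHandle)>& fn) {
  FakeHandle handle;
  fn(handle);
}

int main() {
  const int M = 2, N = 3, K = 4;
  const float alpha = 1.0f, beta = 0.0f;

  // [&] captures M, N, K, alpha, beta by reference: the closure stores no
  // copies, so a compiler bug in materializing by-value captures has
  // nothing to miscompile.
  CallWithHandle([&](FakeHandle h) {
    (void)h;  // a real callback would hand h to the BLAS routine
    std::cout << "gemm " << M << "x" << N << "x" << K << " alpha=" << alpha
              << " beta=" << beta << "\n";
  });
  return 0;
}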