Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into enable_fc_passes_
PaddlePaddle · Oct 12, 2022 · 1538443 · 1538443
2 parents e08d033 + 5303b66
commit 1538443
Show file tree

Hide file tree

Showing 2,323 changed files with 16,149 additions and 10,248 deletions.
diff --git a/.cmake-format.py b/.cmake-format.py
@@ -50,12 +50,6 @@
                 "DEPS": '*',
             }
         },
-        "hip_library": {
-            "kwargs": {
-                "SRCS": '*',
-                "DEPS": '*',
-            }
-        },
         "xpu_library": {
             "kwargs": {
                 "SRCS": '*',
@@ -68,12 +62,6 @@
                 "DEPS": '*',
             }
         },
-        "hip_library": {
-            "kwargs": {
-                "SRCS": '*',
-                "DEPS": '*',
-            }
-        },
         "go_library": {
             "kwargs": {
                 "SRCS": '*',

diff --git a/.flake8 b/.flake8
@@ -5,91 +5,35 @@ exclude =
     # A trick to exclude fluid/ but keep fluid/tests/, see more at
     # https://github.com/PaddlePaddle/Paddle/pull/46290#discussion_r976392010
     ./python/paddle/fluid/[!t]**,
-    ./python/paddle/fluid/tra**
+    ./python/paddle/fluid/tra**,
+    # Exclude auto-generated files
+    *_pb2.py,
+    # Exclude third-party libraries
+    ./python/paddle/utils/gast/**,
+    # Exclude files that will be removed in the future, see more at
+    # https://github.com/PaddlePaddle/Paddle/pull/46782#issuecomment-1273033731
+    ./python/paddle/fluid/tests/unittests/npu/**,
+    ./python/paddle/fluid/tests/unittests/mlu/**
 ignore =
     # E, see https://pycodestyle.pycqa.org/en/latest/intro.html#error-codes
     E121,E122,E123,E125,E126,E127,E128,E129,E131,
     E201,E202,E203,E225,E226,E228,E231,E241,E251,E261,E262,E265,E266,E271,E272,
-    E301,E302,E303,E305,E306,
+    E301,E302,E303,E305,
     E401,E402,
     E501,E502,
-    E701,E711,E712,E713,E714,E721,E722,E731,E741,
+    E701,E711,E712,E721,E722,E731,E741,
 
     # F, see https://flake8.pycqa.org/en/latest/user/error-codes.html
-    F402,F403,F405,
-    F522,F524,F541,
-    F601,F631,F632,
+    F403,F405,
+    F522,
     F811,F821,F841,
 
     # W, see https://pycodestyle.pycqa.org/en/latest/intro.html#error-codes
     W503,W504
-    W601,W605
+    W605
 per-file-ignores =
     # These files need tabs for testing.
     python/paddle/fluid/tests/unittests/dygraph_to_static/test_error.py:E101,W191
     python/paddle/fluid/tests/unittests/collective/fleet/test_hdfs1.py:E101,W191
     # Ignore unused imports in __init__.py
     __init__.py: F401
-    # These files will be fixed later
-    r/*:F401
-    cmake/*:F401
-    paddle/*:F401
-    tools/*:F401
-    python/paddle/signal.py:F401
-    python/paddle/common_ops_import.py:F401
-    python/paddle/check_import_scipy.py:F401
-    python/paddle/fft.py:F401
-    python/paddle/_C_ops.py:F401
-    python/paddle/framework/*:F401
-    python/paddle/reader/*:F401
-    python/paddle/nn/*:F401
-    python/paddle/distributed/*:F401
-    python/paddle/proto/*:F401
-    python/paddle/onnx/*:F401
-    python/paddle/optimizer/*:F401
-    python/paddle/hapi/*:F401
-    python/paddle/autograd/*:F401
-    python/paddle/dataset/*:F401
-    python/paddle/io/*:F401
-    python/paddle/cost_model/*:F401
-    python/paddle/tests/*:F401
-    python/paddle/distribution/*:F401
-    python/paddle/geometric/*:F401
-    python/paddle/utils/*:F401
-    python/paddle/vision/*:F401
-    python/paddle/quantization/*:F401
-    python/paddle/libs/*:F401
-    python/paddle/audio/*:F401
-    python/paddle/incubate/*:F401
-    python/paddle/amp/*:F401
-    python/paddle/jit/*:F401
-    python/paddle/static/*:F401
-    python/paddle/inference/*:F401
-    python/paddle/device/*:F401
-    python/paddle/profiler/*:F401
-    python/paddle/tensor/*:F401
-    python/paddle/text/*:F401
-    python/paddle/metric/*:F401
-    python/paddle/fluid/tests/custom_kernel/*:F401
-    python/paddle/fluid/tests/custom_runtime/*:F401
-    python/paddle/fluid/tests/unittests/ir/*:F401
-    python/paddle/fluid/tests/unittests/tokenizer/*:F401
-    python/paddle/fluid/tests/unittests/xpu/*:F401
-    python/paddle/fluid/tests/unittests/distribution/*:F401
-    python/paddle/fluid/tests/unittests/mlu/*:F401
-    python/paddle/fluid/tests/unittests/npu/*:F401
-    python/paddle/fluid/tests/unittests/ipu/*:F401
-    python/paddle/fluid/tests/unittests/distributed_passes/*:F401
-    python/paddle/fluid/tests/unittests/auto_parallel/*:F401
-    python/paddle/fluid/tests/unittests/dygraph_to_static/*:F401
-    python/paddle/fluid/tests/unittests/ps/*:F401
-    python/paddle/fluid/tests/unittests/fft/*:F401
-    python/paddle/fluid/tests/unittests/white_list/*:F401
-    python/paddle/fluid/tests/unittests/sequence/*:F401
-    python/paddle/fluid/tests/unittests/mkldnn/*:F401
-    python/paddle/fluid/tests/unittests/rnn/*:F401
-    python/paddle/fluid/tests/book/*:F401
-    python/paddle/fluid/tests/custom_op/*:F401
-    python/paddle/fluid/tests/unittests/test_*:F401
-    python/paddle/fluid/tests/test_*:F401
-    python/paddle/fluid/tests/*:F401
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -1,10 +1,13 @@
-# Exclude all third-party libraries globally
+# Exclude all third-party libraries and auto-generated files globally
 exclude: |
     (?x)^(
         patches/.+|
         paddle/fluid/framework/fleet/heter_ps/cudf/.+|
         paddle/fluid/distributed/ps/thirdparty/round_robin.h|
-        python/paddle/utils/gast/.+
+        python/paddle/utils/gast/.+|
+        .+_pb2\.py|
+        python/paddle/fluid/tests/unittests/npu/.+|
+        python/paddle/fluid/tests/unittests/mlu/.+
     )$
 repos:
 -   repo: https://github.com/Lucas-C/pre-commit-hooks.git

diff --git a/AUTHORS.md b/AUTHORS.md
@@ -114,3 +114,5 @@ This is an incomplete list of authors of [Paddle](https://github.com/PaddlePaddl
 | czr-gc | Zhaorui Chen (Graphcore) |
 | zhao-han | Han Zhao (Graphcore) |
 | yiakwy, yiakwy-xpu-ml-framework-team | Yi Wang (Graphcore) |
+| [Yulv-git](https://github.com/Yulv-git) | Shuangchi He |
+| [zrr1999](https://github.com/zrr1999) | Rongrui Zhan |
diff --git a/README.md b/README.md
@@ -89,8 +89,8 @@ We provide [English](https://www.paddlepaddle.org.cn/documentation/docs/en/guide
 
 ## Courses
 
-- [Server Deployments](https://aistudio.baidu.com/aistudio/course/introduce/19084): Courses intorducing high performance server deployments via local and remote services.
-- [Edge Deployments](https://aistudio.baidu.com/aistudio/course/introduce/22690): Courses intorducing edge deployments from mobile, IoT to web and applets.
+- [Server Deployments](https://aistudio.baidu.com/aistudio/course/introduce/19084): Courses introducing high performance server deployments via local and remote services.
+- [Edge Deployments](https://aistudio.baidu.com/aistudio/course/introduce/22690): Courses introducing edge deployments from mobile, IoT to web and applets.
 
 ## Copyright and License
 PaddlePaddle is provided under the [Apache-2.0 license](LICENSE).
diff --git a/README_cn.md b/README_cn.md
@@ -88,7 +88,7 @@ PaddlePaddle用户可领取**免费Tesla V100在线算力资源**，训练模型
 ## 课程
 
 - [服务器部署](https://aistudio.baidu.com/aistudio/course/introduce/19084): 详细介绍高性能服务器端部署实操，包含本地端及服务化Serving部署等
-- [端侧部署](https://aistudio.baidu.com/aistudio/course/introduce/22690): 详细介绍端侧多场景部署实操，从移端端设备、IoT、网页到小程序部署
+- [端侧部署](https://aistudio.baidu.com/aistudio/course/introduce/22690): 详细介绍端侧多场景部署实操，从移动端设备、IoT、网页到小程序部署
 
 ## 版权和许可证
 PaddlePaddle由[Apache-2.0 license](LICENSE)提供
diff --git a/cmake/generic.cmake b/cmake/generic.cmake
@@ -96,7 +96,7 @@ if(NOT APPLE AND NOT WIN32)
   link_libraries(${CMAKE_THREAD_LIBS_INIT})
   if(WITH_PSLIB OR WITH_DISTRIBUTE)
     set(CMAKE_CXX_LINK_EXECUTABLE
-        "${CMAKE_CXX_LINK_EXECUTABLE} -pthread -ldl -lrt -lz -lssl")
+        "${CMAKE_CXX_LINK_EXECUTABLE} -pthread -ldl -lrt -lz -lssl -lcrypto")
   else()
     set(CMAKE_CXX_LINK_EXECUTABLE
         "${CMAKE_CXX_LINK_EXECUTABLE} -pthread -ldl -lrt")

diff --git a/cmake/make_resource.py b/cmake/make_resource.py
@@ -20,6 +20,7 @@
 out = sys.argv[2]
 var = re.sub(r'[ .-]', '_', os.path.basename(res))
 
-open(out, "w").write("const unsigned char " + var + "[] = {" + ",".join([
-    "0x%02x" % ord(c) for c in open(res).read()
-]) + ",0};\n" + "const unsigned " + var + "_size = sizeof(" + var + ");\n")
+open(out, "w").write("const unsigned char " + var + "[] = {" +
+                     ",".join(["0x%02x" % ord(c)
+                               for c in open(res).read()]) + ",0};\n" +
+                     "const unsigned " + var + "_size = sizeof(" + var + ");\n")
diff --git a/cmake/operators.cmake b/cmake/operators.cmake
@@ -518,13 +518,6 @@ function(op_library TARGET)
            "USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE(conv2d, MKLDNN, S8);\n")
       file(APPEND ${pybind_file}
            "USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE(conv2d, MKLDNN, U8);\n")
-    elseif(${MKLDNN_FILE} STREQUAL "transpose_mkldnn_op")
-      file(APPEND ${pybind_file}
-           "USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE(transpose2, MKLDNN, FP32);\n")
-      file(APPEND ${pybind_file}
-           "USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE(transpose2, MKLDNN, S8);\n")
-      file(APPEND ${pybind_file}
-           "USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE(transpose2, MKLDNN, U8);\n")
     elseif(${MKLDNN_FILE} STREQUAL "fc_mkldnn_op")
       file(APPEND ${pybind_file}
            "USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE(fc, MKLDNN, FP32);\n")

diff --git a/paddle/fluid/distributed/collective/ProcessGroup.h b/paddle/fluid/distributed/collective/ProcessGroup.h
@@ -125,6 +125,16 @@ class ProcessGroup {
         "ProcessGroup%s does not support broadcast", GetBackendName()));
   }
 
+  virtual std::shared_ptr<ProcessGroup::Task> Broadcast(
+      std::vector<phi::DenseTensor>& /* input tensors */,   // NOLINT
+      std::vector<phi::DenseTensor>& /* output tensors */,  // NOLINT
+      const BroadcastOptions&,
+      bool) {
+    PADDLE_THROW(platform::errors::InvalidArgument(
+        "ProcessGroup%s does not support broadcast with sync_op flag",
+        GetBackendName()));
+  }
+
   virtual std::shared_ptr<ProcessGroup::Task> Barrier(
       const BarrierOptions& = BarrierOptions()) {
     PADDLE_THROW(platform::errors::InvalidArgument(
@@ -160,14 +170,14 @@ class ProcessGroup {
   virtual std::shared_ptr<ProcessGroup::Task> Send_Partial(
       phi::DenseTensor&,  // NOLINT
       int,
-      int,
-      int) {
+      int64_t,
+      int64_t) {
     PADDLE_THROW(platform::errors::InvalidArgument(
         "ProcessGroup%s does not support send_partial", GetBackendName()));
   }
 
   virtual std::shared_ptr<ProcessGroup::Task> Send_Partial(
-      phi::DenseTensor&, int, int, int, bool) {  // NOLINT
+      phi::DenseTensor&, int, int64_t, int64_t, bool) {  // NOLINT
     PADDLE_THROW(platform::errors::InvalidArgument(
         "ProcessGroup%s does not support send_partial with sync_op flag",
         GetBackendName()));
@@ -176,14 +186,14 @@ class ProcessGroup {
   virtual std::shared_ptr<ProcessGroup::Task> Recv_Partial(
       phi::DenseTensor&,  // NOLINT
       int,
-      int,
-      int) {
+      int64_t,
+      int64_t) {
     PADDLE_THROW(platform::errors::InvalidArgument(
         "ProcessGroup%s does not support recv_partial", GetBackendName()));
   }
 
   virtual std::shared_ptr<ProcessGroup::Task> Recv_Partial(
-      phi::DenseTensor&, int, int, int, bool) {  // NOLINT
+      phi::DenseTensor&, int, int64_t, int64_t, bool) {  // NOLINT
     PADDLE_THROW(platform::errors::InvalidArgument(
         "ProcessGroup%s does not support recv_partial with sync_op flag",
         GetBackendName()));
@@ -208,8 +218,18 @@ class ProcessGroup {
   virtual std::shared_ptr<ProcessGroup::Task> AllGather_Partial(
       std::vector<phi::DenseTensor>& in_tensors,   // NOLINT
       std::vector<phi::DenseTensor>& out_tensors,  // NOLINT
-      int offset,
-      int length) {  // NOLINT
+      int64_t offset,
+      int64_t length) {
+    PADDLE_THROW(platform::errors::InvalidArgument(
+        "ProcessGroup%s does not support AllGather_Partial", GetBackendName()));
+  }
+
+  virtual std::shared_ptr<ProcessGroup::Task> AllGather_Partial(
+      std::vector<phi::DenseTensor>& in_tensors,   // NOLINT
+      std::vector<phi::DenseTensor>& out_tensors,  // NOLINT
+      int64_t offset,
+      int64_t length,
+      bool) {
     PADDLE_THROW(platform::errors::InvalidArgument(
         "ProcessGroup%s does not support AllGather_Partial", GetBackendName()));
   }
@@ -221,6 +241,14 @@ class ProcessGroup {
         "ProcessGroup%s does not support AllToAll", GetBackendName()));
   }
 
+  virtual std::shared_ptr<ProcessGroup::Task> AllToAll(
+      std::vector<phi::DenseTensor>&,  // NOLINT
+      std::vector<phi::DenseTensor>&,  // NOLINT
+      bool) {
+    PADDLE_THROW(platform::errors::InvalidArgument(
+        "ProcessGroup%s does not support alltoall", GetBackendName()));
+  }
+
   virtual std::shared_ptr<ProcessGroup::Task> AllToAll_Single(
       std::vector<phi::DenseTensor>&,  // NOLINT
       std::vector<phi::DenseTensor>&,  // NOLINT
@@ -230,26 +258,66 @@ class ProcessGroup {
         "ProcessGroup%s does not support AllToAll_Single", GetBackendName()));
   }
 
+  virtual std::shared_ptr<ProcessGroup::Task> AllToAllSingle(
+      std::vector<phi::DenseTensor>&,  // NOLINT
+      std::vector<phi::DenseTensor>&,  // NOLINT
+      std::vector<int64_t>&,
+      std::vector<int64_t>&,
+      bool) {
+    PADDLE_THROW(platform::errors::InvalidArgument(
+        "ProcessGroup%s does not support alltoall_single", GetBackendName()));
+  }
+
   virtual std::shared_ptr<ProcessGroup::Task> Reduce(
       std::vector<phi::DenseTensor>&,  // NOLINT
       std::vector<phi::DenseTensor>&,  // NOLINT
       const ReduceOptions& opts) {
     PADDLE_THROW(platform::errors::InvalidArgument(
-        "ProcessGroup%s does not support Reduce", GetBackendName()));
+        "ProcessGroup%s does not support reduce", GetBackendName()));
+  }
+
+  virtual std::shared_ptr<ProcessGroup::Task> Reduce(
+      std::vector<phi::DenseTensor>& /* input tensors */,   // NOLINT
+      std::vector<phi::DenseTensor>& /* output tensors */,  // NOLINT
+      const ReduceOptions&,
+      bool) {
+    PADDLE_THROW(platform::errors::InvalidArgument(
+        "ProcessGroup%s does not support reduce with sync_op flag",
+        GetBackendName()));
   }
 
   virtual std::shared_ptr<ProcessGroup::Task> Scatter(
       std::vector<phi::DenseTensor>&,  // NOLINT
       std::vector<phi::DenseTensor>&,  // NOLINT
-      const ScatterOptions&) {         // NOLINT
+      const ScatterOptions&) {
     PADDLE_THROW(platform::errors::InvalidArgument(
-        "ProcessGroup%s does not support Scatter", GetBackendName()));
+        "ProcessGroup%s does not support scatter", GetBackendName()));
+  }
+
+  virtual std::shared_ptr<ProcessGroup::Task> Scatter(
+      std::vector<phi::DenseTensor>&,  // NOLINT
+      std::vector<phi::DenseTensor>&,  // NOLINT
+      const ScatterOptions&,
+      bool) {
+    PADDLE_THROW(platform::errors::InvalidArgument(
+        "ProcessGroup%s does not support scatter with sync_op flag",
+        GetBackendName()));
+  }
+
+  virtual std::shared_ptr<ProcessGroup::Task> ReduceScatter(
+      std::vector<phi::DenseTensor>&,  // NOLINT
+      std::vector<phi::DenseTensor>&,  // NOLINT
+      const ReduceScatterOptions&,
+      bool) {
+    PADDLE_THROW(platform::errors::InvalidArgument(
+        "ProcessGroup%s does not support reduce_scatter with sync_op flag",
+        GetBackendName()));
   }
 
   virtual std::shared_ptr<ProcessGroup::Task> _ReduceScatterBase(
-      phi::DenseTensor&,              // NOLINT
-      phi::DenseTensor&,              // NOLINT
-      const ReduceScatterOptions&) {  // NOLINT
+      phi::DenseTensor&,  // NOLINT
+      phi::DenseTensor&,  // NOLINT
+      const ReduceScatterOptions&) {
     PADDLE_THROW(platform::errors::InvalidArgument(
         "ProcessGroup%s does not support ReduceScatter", GetBackendName()));
   }

diff --git a/paddle/fluid/distributed/collective/ProcessGroupCustom.cc b/paddle/fluid/distributed/collective/ProcessGroupCustom.cc
@@ -267,8 +267,8 @@ void* XcclGetPointerByOffset(void* raw_pointer,
 std::shared_ptr<ProcessGroup::Task> ProcessGroupCustom::AllGather_Partial(
     std::vector<phi::DenseTensor>& in_tensors,
     std::vector<phi::DenseTensor>& out_tensors,
-    int offset,
-    int length) {
+    int64_t offset,
+    int64_t length) {
   PADDLE_ENFORCE_EQ(
       CheckTensorsInCustomPlace(in_tensors, device_type_),
       true,