add OP lu forward (#38559)

LGTM
PaddlePaddle · Dec 30, 2021 · 4e21457 · 4e21457
1 parent 790cadd
commit 4e21457
Show file tree

Hide file tree

Showing 7 changed files with 973 additions and 0 deletions.
diff --git a/cmake/operators.cmake b/cmake/operators.cmake
@@ -197,6 +197,7 @@ function(op_library TARGET)
         list(REMOVE_ITEM miopen_cu_cc_srcs "grid_sampler_cudnn_op.cu.cc")
         list(REMOVE_ITEM hip_srcs "cholesky_op.cu")
         list(REMOVE_ITEM hip_srcs "cholesky_solve_op.cu")
+        list(REMOVE_ITEM hip_srcs "lu_op.cu")
         list(REMOVE_ITEM hip_srcs "matrix_rank_op.cu")
         list(REMOVE_ITEM hip_srcs "svd_op.cu")
         list(REMOVE_ITEM hip_srcs "eigvalsh_op.cu")

diff --git a/paddle/fluid/operators/lu_op.cc b/paddle/fluid/operators/lu_op.cc
@@ -0,0 +1,162 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/lu_op.h"
+
+namespace paddle {
+namespace operators {
+
+// Describes the interface of the `lu` operator: one input (X), three outputs
+// (Out, Pivots, Infos) and the boolean attribute `pivots`.
+class LUOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  // Registers the operator comment, its input/output slots and its attribute.
+  void Make() override {
+    AddComment(R"DOC(LU decomposition, 
+                Computes the LU factorization of a matrix or batches of matrices A.
+                )DOC");
+    // X holds one matrix or a batch of matrices in its two trailing dims.
+    AddInput("X", "(Tensor) The input tensor, shape of (*,m,n)");
+    // Out receives the factorization result and has the same shape as X.
+    AddOutput("Out", "(Tensor) The output tensor, shape same to X");
+    // Pivots holds min(m, n) entries per matrix.
+    AddOutput("Pivots",
+              "Stores all the intermediate transpositions of rows. shape of "
+              "(*,min(m,n))");
+    // Infos holds one status value per matrix.
+    AddOutput("Infos",
+              "(Tensor) This is a tensor of size (*) where non-zero values "
+              "indicate whether factorization for the matrix has succeeded");
+    // Pivoting is enabled by default.
+    AddAttr<bool>("pivots", "Whether pivoting is done").SetDefault(true);
+  }
+};
+
+// Operator definition for `lu`. Infers the output shapes from X and selects
+// the kernel according to X's data type.
+class LUOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  // Shape inference for an input X of rank >= 2:
+  //   Out    : same dims as X.
+  //   Infos  : the batch dims of X, or {1} when X is a single matrix.
+  //   Pivots : the batch dims of X followed by the smaller of the two
+  //            trailing dims; only set when the `pivots` attribute is true.
+  void InferShape(framework::InferShapeContext *context) const override {
+    OP_INOUT_CHECK(context->HasInput("X"), "Input", "X", "LU");
+    OP_INOUT_CHECK(context->HasOutput("Out"), "Output", "Out", "LU");
+    bool pivots = context->Attrs().Get<bool>("pivots");
+    auto x_dims = context->GetInputDim("X");
+    int x_rank = x_dims.size();
+    // The check accepts rank 2, so the message must say "or equal to".
+    PADDLE_ENFORCE_GE(
+        x_rank, 2,
+        platform::errors::InvalidArgument(
+            "the rank of input must be greater than or equal to 2"));
+    context->SetOutputDim("Out", x_dims);
+    // Only the smaller trailing dim is needed, so the two are not named
+    // individually.
+    int min_mn = std::min(static_cast<int>(x_dims[x_rank - 2]),
+                          static_cast<int>(x_dims[x_rank - 1]));
+    auto dims_vec = framework::vectorize(x_dims);
+    OP_INOUT_CHECK(context->HasOutput("Infos"), "Output", "Infos", "LU");
+    if (x_rank == 2) {
+      // A single matrix produces exactly one status value. Note that
+      // std::vector<int>(1) would be {0}, i.e. a shape with no elements, which
+      // disagrees with the {1} shape the kernels resize Infos to.
+      context->SetOutputDim("Infos", framework::make_ddim({1}));
+    } else {
+      // Drop the two trailing (matrix) dims and keep the batch dims.
+      auto Infos_dim =
+          std::vector<int>(dims_vec.begin(), dims_vec.begin() + x_rank - 2);
+      context->SetOutputDim("Infos", framework::make_ddim(Infos_dim));
+    }
+    if (pivots) {
+      OP_INOUT_CHECK(context->HasOutput("Pivots"), "Output", "Pivots", "LU");
+      // Batch dims plus one trailing dim, which is then overwritten with
+      // min(m, n).
+      auto Pivots_dim =
+          std::vector<int>(dims_vec.begin(), dims_vec.begin() + x_rank - 1);
+      Pivots_dim[x_rank - 2] = min_mn;
+      context->SetOutputDim("Pivots", framework::make_ddim(Pivots_dim));
+    }
+  }
+
+ protected:
+  // The kernel is chosen from the data type of X and the current place.
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext &ctx) const override {
+    return framework::OpKernelType(
+        OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace());
+  }
+};
+
+// Propagates variable types and data types from X to the outputs of `lu`.
+class LUOpVarTypeInference : public framework::VarTypeInference {
+ public:
+  // Every output takes X's variable type. Out also takes X's data type, while
+  // Pivots and Infos are always INT32.
+  void operator()(framework::InferVarTypeContext *ctx) const override {
+    const auto x_var_type = ctx->GetInputType("X", 0);
+    const auto x_data_type = ctx->GetInputDataType("X", 0);
+    const auto index_data_type = framework::proto::VarType::INT32;
+
+    ctx->SetOutputType("Out", x_var_type, framework::ALL_ELEMENTS);
+    ctx->SetOutputDataType("Out", x_data_type, framework::ALL_ELEMENTS);
+
+    // The two integer outputs are configured identically, Pivots first.
+    for (const char *int_output : {"Pivots", "Infos"}) {
+      ctx->SetOutputType(int_output, x_var_type, framework::ALL_ELEMENTS);
+      ctx->SetOutputDataType(int_output, index_data_type,
+                             framework::ALL_ELEMENTS);
+    }
+  }
+};
+
+// CPU kernel of the `lu` operator.
+//
+// Reads X, writes the factorization into Out, the pivot indices into Pivots
+// and one status value per matrix into Infos. Each matrix of the batch is
+// factorized by a separate math::lapackLu<T> call.
+template <typename T>
+class LUKernel : public framework::OpKernel<T> {
+ public:
+  // Raises InvalidArgument when the `pivots` attribute is false, because the
+  // CPU path only implements the pivoted factorization.
+  void Compute(const paddle::framework::ExecutionContext &ctx) const override {
+    auto pivots = ctx.Attr<bool>("pivots");
+    auto *xin = ctx.Input<framework::Tensor>("X");
+    auto *out = ctx.Output<framework::Tensor>("Out");
+    auto *IpivT = ctx.Output<framework::Tensor>("Pivots");
+    auto *InfoT = ctx.Output<framework::Tensor>("Infos");
+    PADDLE_ENFORCE_EQ(pivots, true,
+                      platform::errors::InvalidArgument(
+                          "lu without pivoting is not implemented on the CPU, "
+                          "but got pivots=False"));
+
+    math::DeviceIndependenceTensorOperations<paddle::platform::CPUDeviceContext,
+                                             T>
+        helper(ctx);
+    // Factorize a transposed copy and transpose back at the end; presumably
+    // this maps the row-major data onto LAPACK's column-major layout
+    // (TODO confirm against helper.Transpose).
+    *out = helper.Transpose(*xin);
+
+    auto outdims = out->dims();
+    auto outrank = outdims.size();
+
+    int m = static_cast<int>(outdims[outrank - 1]);
+    int n = static_cast<int>(outdims[outrank - 2]);
+    int lda = std::max(1, m);
+    const int min_mn = std::min(m, n);
+
+    // Pivots: batch dims followed by min(m, n).
+    auto ipiv_dims = slice_ddim(outdims, 0, outrank - 1);
+    ipiv_dims[outrank - 2] = min_mn;
+    IpivT->Resize(ipiv_dims);
+    auto ipiv_data = IpivT->mutable_data<int>(ctx.GetPlace());
+
+    // Infos: batch dims only, or {1} for a single matrix.
+    auto info_dims = slice_ddim(outdims, 0, outrank - 2);
+    if (info_dims.size() == 0) {
+      info_dims = framework::make_ddim({1});
+    }
+    InfoT->Resize(info_dims);
+    auto info_data = InfoT->mutable_data<int>(ctx.GetPlace());
+
+    // Keep the batch count and the per-matrix stride in 64 bits so that
+    // b * m * n cannot overflow `int` for large batches.
+    const int64_t batch_count =
+        std::max<int64_t>(static_cast<int64_t>(product(info_dims)), 1);
+    const int64_t matrix_stride = static_cast<int64_t>(m) * n;
+    auto out_data = out->mutable_data<T>(ctx.GetPlace());
+    for (int64_t b = 0; b < batch_count; ++b) {
+      auto out_data_item = out_data + b * matrix_stride;
+      int *info_data_item = info_data + b;
+      int *ipiv_data_item = ipiv_data + b * min_mn;
+      math::lapackLu<T>(m, n, out_data_item, lda, ipiv_data_item,
+                        info_data_item);
+    }
+    *out = helper.Transpose(*out);
+  }
+};
+
+// Declares LUOpInplaceInferer for the pair {"X", "Out"}; presumably this lets
+// Out reuse X's buffer (confirm against the macro definition).
+DECLARE_INPLACE_OP_INFERER(LUOpInplaceInferer, {"X", "Out"});
+
+}  // namespace operators
+}  // namespace paddle
+// Namespace aliases; `ops` is used by the registration macros below.
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+
+// Registers the `lu` operator together with its maker, var-type inference and
+// in-place inferer.
+REGISTER_OPERATOR(lu, ops::LUOp, ops::LUOpMaker, ops::LUOpVarTypeInference,
+                  ops::LUOpInplaceInferer);
+
+// CPU kernels are provided for float and double.
+REGISTER_OP_CPU_KERNEL(lu, ops::LUKernel<float>, ops::LUKernel<double>);
diff --git a/paddle/fluid/operators/lu_op.cu b/paddle/fluid/operators/lu_op.cu
@@ -0,0 +1,156 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifndef PADDLE_WITH_HIP
+// HIP does not support cusolver
+
+#include "paddle/fluid/memory/memory.h"
+#include "paddle/fluid/operators/lu_op.h"
+#include "paddle/fluid/platform/dynload/cusolver.h"
+
+namespace paddle {
+namespace operators {
+
+// Convenience aliases for the tensor and CUDA device-context types.
+using Tensor = framework::Tensor;
+using CUDADeviceContext = paddle::platform::CUDADeviceContext;
+
+// Type-dispatched wrapper around the cuSOLVER getrf workspace query.
+// Specialized below for float and double; the result is written to *lwork and
+// is used by the caller as a count of T elements.
+template <typename T>
+void cusolver_bufferSize(const cusolverDnHandle_t& cusolverH, int m, int n,
+                         T* d_A, int lda, int* lwork);
+// Type-dispatched wrapper around the cuSOLVER getrf factorization.
+// Specialized below for float and double.
+template <typename T>
+void cusolver_getrf(const cusolverDnHandle_t& cusolverH, int m, int n, T* d_A,
+                    int lda, T* d_work, int* d_Ipiv, int* d_info);
+
+// float specialization: forwards to cusolverDnSgetrf_bufferSize and enforces
+// that the call succeeds.
+template <>
+void cusolver_bufferSize<float>(const cusolverDnHandle_t& cusolverH, int m,
+                                int n, float* d_A, int lda, int* lwork) {
+  PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDnSgetrf_bufferSize(
+      cusolverH, m, n, d_A, lda, lwork));
+}
+
+// double specialization: forwards to cusolverDnDgetrf_bufferSize and enforces
+// that the call succeeds.
+template <>
+void cusolver_bufferSize<double>(const cusolverDnHandle_t& cusolverH, int m,
+                                 int n, double* d_A, int lda, int* lwork) {
+  PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDnDgetrf_bufferSize(
+      cusolverH, m, n, d_A, lda, lwork));
+}
+
+// float specialization: forwards to cusolverDnSgetrf and enforces that the
+// call succeeds.
+template <>
+void cusolver_getrf<float>(const cusolverDnHandle_t& cusolverH, int m, int n,
+                           float* d_A, int lda, float* d_work, int* d_Ipiv,
+                           int* d_info) {
+  PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDnSgetrf(
+      cusolverH, m, n, d_A, lda, d_work, d_Ipiv, d_info));
+}
+
+// double specialization: forwards to cusolverDnDgetrf and enforces that the
+// call succeeds.
+template <>
+void cusolver_getrf<double>(const cusolverDnHandle_t& cusolverH, int m, int n,
+                            double* d_A, int lda, double* d_work, int* d_Ipiv,
+                            int* d_info) {
+  PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDnDgetrf(
+      cusolverH, m, n, d_A, lda, d_work, d_Ipiv, d_info));
+}
+
+// Factorizes a single m x n matrix in place via cuSOLVER getrf.
+//
+//   d_A    : matrix data with leading dimension lda; overwritten by getrf.
+//   d_Ipiv : pivot output buffer; forwarded to getrf unchanged, so a null
+//            pointer is passed through as null (see the cuSOLVER getrf
+//            reference for the no-pivoting semantics).
+//   d_info : receives the getrf status value.
+//   ctx    : supplies the CUDA device context and its cusolver handle.
+template <typename T>
+void lu_decomposed_kernel(int m, int n, T* d_A, int lda, int* d_Ipiv,
+                          int* d_info, const framework::ExecutionContext& ctx) {
+  /* step 1: get cusolver handle*/
+  auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
+  auto cusolverH = dev_ctx.cusolver_dn_handle();
+
+  /* step 2: query working space of getrf */
+  int lwork = 0;
+  cusolver_bufferSize(cusolverH, m, n, d_A, lda, &lwork);
+
+  // lwork is a count of T elements, hence the scaling by sizeof(T).
+  auto work_buff = memory::Alloc(dev_ctx, lwork * sizeof(T));
+  T* d_work = reinterpret_cast<T*>(work_buff->ptr());
+
+  /* step 3: LU factorization */
+  // d_Ipiv is passed as-is: branching on it would hand getrf the very same
+  // value in both arms.
+  cusolver_getrf(cusolverH, m, n, d_A, lda, d_work, d_Ipiv, d_info);
+  // Block until the device is idle before the workspace goes out of scope.
+  PADDLE_ENFORCE_GPU_SUCCESS(cudaDeviceSynchronize());
+}
+
+// CUDA kernel of the `lu` operator.
+//
+// Reads X, writes the factorization into Out and one status value per matrix
+// into Infos. When the `pivots` attribute is true the pivot indices are
+// written into Pivots; otherwise a null pivot pointer is handed to
+// lu_decomposed_kernel. Each matrix of the batch is factorized by a separate
+// lu_decomposed_kernel call.
+template <typename T>
+class LUCUDAKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* xin = ctx.Input<framework::Tensor>("X");
+    auto* out = ctx.Output<framework::Tensor>("Out");
+    auto* IpivT = ctx.Output<framework::Tensor>("Pivots");
+    auto* InfoT = ctx.Output<framework::Tensor>("Infos");
+    auto pivots = ctx.Attr<bool>("pivots");
+
+    math::DeviceIndependenceTensorOperations<
+        paddle::platform::CUDADeviceContext, T>
+        helper(ctx);
+    // Factorize a transposed copy and transpose back at the end; presumably
+    // this maps the row-major data onto the column-major layout getrf expects
+    // (TODO confirm against helper.Transpose).
+    *out = helper.Transpose(*xin);
+
+    auto outdims = out->dims();
+    auto outrank = outdims.size();
+
+    int m = static_cast<int>(outdims[outrank - 1]);
+    int n = static_cast<int>(outdims[outrank - 2]);
+    int lda = std::max(1, m);
+    const int min_mn = std::min(m, n);
+    if (pivots) {
+      // Pivots: batch dims followed by min(m, n).
+      auto ipiv_dims = slice_ddim(outdims, 0, outrank - 1);
+      ipiv_dims[outrank - 2] = min_mn;
+      IpivT->Resize(ipiv_dims);
+    }
+    auto ipiv_data = IpivT->mutable_data<int>(ctx.GetPlace());
+
+    // Infos: batch dims only, or {1} for a single matrix.
+    auto info_dims = slice_ddim(outdims, 0, outrank - 2);
+    if (info_dims.size() == 0) {
+      info_dims = framework::make_ddim({1});
+    }
+    InfoT->Resize(info_dims);
+    auto info_data = InfoT->mutable_data<int>(ctx.GetPlace());
+
+    // Keep the batch count and the per-matrix stride in 64 bits so that
+    // b * m * n cannot overflow `int` for large batches.
+    const int64_t batch_count =
+        std::max<int64_t>(static_cast<int64_t>(product(info_dims)), 1);
+    const int64_t matrix_stride = static_cast<int64_t>(m) * n;
+    auto out_data = out->mutable_data<T>(ctx.GetPlace());
+    for (int64_t b = 0; b < batch_count; ++b) {
+      auto out_data_item = out_data + b * matrix_stride;
+      int* info_data_item = info_data + b;
+      // Without pivoting no pivot buffer is handed to the factorization.
+      int* ipiv_data_item = pivots ? ipiv_data + b * min_mn : nullptr;
+      lu_decomposed_kernel(m, n, out_data_item, lda, ipiv_data_item,
+                           info_data_item, ctx);
+    }
+    *out = helper.Transpose(*out);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+// Namespace aliases; `ops` is used by the registration macro below.
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+
+// CUDA kernels for the `lu` operator are provided for float and double.
+REGISTER_OP_CUDA_KERNEL(lu, ops::LUCUDAKernel<float>,
+                        ops::LUCUDAKernel<double>);
+
+#endif  // not PADDLE_WITH_HIP