
CPU forward calculation replaces Eigen with Lapack; modify linalg exposure rules #35916

Merged: 9 commits, Sep 26, 2021
182 changes: 70 additions & 112 deletions paddle/fluid/operators/math/eigen_values_vectors.h
@@ -14,8 +14,8 @@

#pragma once

#include "Eigen/Core"
#include "paddle/fluid/memory/memory.h"
#include "paddle/fluid/operators/math/lapack_function.h"
#include "paddle/fluid/operators/svd_helper.h"
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/dynload/cusolver.h"
@@ -25,84 +25,6 @@ namespace paddle {
namespace operators {
namespace math {

template <typename T, int MajorType = Eigen::RowMajor,
typename IndexType = Eigen::DenseIndex>
using InputMatrixMap = Eigen::Map<
const Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>>;

template <typename T, int MajorType = Eigen::RowMajor,
typename IndexType = Eigen::DenseIndex>
using OutputMatrixMap = Eigen::Map<
Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>>;

template <typename ValueType>
inline void ComputeFloatEigenvaluesAndVectors(ValueType *x_data,
ValueType *eigenvalues_data,
ValueType *eigenvectors_data,
int batches, int rows, int cols,
bool has_vectors) {
int stride = rows * cols;
for (int i = 0; i < batches; i++) {
auto m = InputMatrixMap<ValueType>(x_data + i * stride, rows, cols);
auto eigenvalues =
OutputMatrixMap<ValueType>(eigenvalues_data + i * rows, 1, rows);
auto eigenvectors =
OutputMatrixMap<ValueType>(eigenvectors_data + i * stride, rows, cols);

Eigen::SelfAdjointEigenSolver<Eigen::Matrix<
ValueType, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>>
eigen_solver(m, has_vectors ? Eigen::ComputeEigenvectors
: Eigen::EigenvaluesOnly);
PADDLE_ENFORCE_EQ(
eigen_solver.info(), Eigen::Success,
platform::errors::InvalidArgument(
"Self Adjoint Eigen decomposition is not successful. "
"The %d-th input matrice might not be not be positive definite.",
i));

eigenvalues = eigen_solver.eigenvalues().transpose();
if (has_vectors) {
eigenvectors = eigen_solver.eigenvectors();
}
}
}

template <typename T, typename ValueType>
inline void ComputeComplexEigenvaluesAndVectors(T *x_data,
ValueType *eigenvalues_data,
T *eigenvectors_data,
int batches, int rows, int cols,
bool has_vectors) {
using Complex = std::complex<ValueType>;
Complex *input = reinterpret_cast<Complex *>(x_data);
Complex *eigenvectors_data_ = reinterpret_cast<Complex *>(eigenvectors_data);

int stride = rows * cols;
for (int i = 0; i < batches; i++) {
auto m = InputMatrixMap<Complex>(input + i * stride, rows, cols);
auto eigenvalues =
OutputMatrixMap<ValueType>(eigenvalues_data + i * rows, 1, rows);
auto eigenvectors =
OutputMatrixMap<Complex>(eigenvectors_data_ + i * stride, rows, cols);

Eigen::SelfAdjointEigenSolver<
Eigen::Matrix<Complex, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>>
eigen_solver(m, has_vectors ? Eigen::ComputeEigenvectors
: Eigen::EigenvaluesOnly);
PADDLE_ENFORCE_EQ(
eigen_solver.info(), Eigen::Success,
platform::errors::InvalidArgument(
"Self Adjoint Eigen decomposition is not successful. "
"The %d-th input matrice might not be not be positive definite.",
i));

eigenvalues = eigen_solver.eigenvalues().transpose();
if (has_vectors) {
eigenvectors = eigen_solver.eigenvectors();
}
}
}

inline int64_t GetBatchSize(framework::DDim dims) {
int64_t batch_size = 1;
auto dim_size = dims.size();
@@ -128,37 +50,74 @@ struct MatrixEighFunctor<platform::CPUDeviceContext, ValueType, T> {
void operator()(const framework::ExecutionContext &ctx, const Tensor &input,
Review comment (Contributor): The functor no longer needs to fetch runtime information from ctx, so it would be better to pass the DeviceContext in directly.

Tensor *eigen_values, Tensor *eigen_vectors, bool is_lower,
bool has_vectors) {
auto dims = input.dims();
auto output_value_dim = eigen_values->dims();
auto *out_value = eigen_values->mutable_data<ValueType>(ctx.GetPlace());

int64_t batch_size = 1;
int dim_size = dims.size();
for (int64_t i = 0; i < dim_size - 2; i++) {
batch_size *= dims[i];
}
auto dito =
DeviceIndependenceTensorOperations<platform::CPUDeviceContext, T>(ctx);
Tensor input_tensor;
TensorCopy(input, ctx.GetPlace(), &input_tensor);
if (!is_lower) {
input_tensor = dito.Transpose(input);
math::DeviceIndependenceTensorOperations<platform::CPUDeviceContext, T>(
ctx);
*eigen_vectors = dito.Transpose(input);
Review comment (Contributor):
  • LAPACK expects column-major input, right? It would be good to add a comment explaining why the transpose is needed here.
  • Looking at the Transpose implemented in svd_helper.h: since only a 2-D transpose is actually needed, there is no reason to special-case 2-6 D; the SpiltBatchSquareMatrix approach in eigvals_op.h is the better pattern.
  • The math::lapackEig<T, Real<T>> calling style in EigvalsKernel is also preferable, since it makes the relationship between the value and vector data types explicit.

Review comment (Contributor): If has_vectors is false, the eigenvalues appear to be the same with or without the transpose, so the transpose may be unnecessary in that case?
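For context, LAPACK assumes column-major (Fortran) storage while Paddle tensors are row-major, and reinterpreting a row-major buffer as column-major yields the transpose; for a Hermitian input the transpose equals the conjugate, which leaves the (real) eigenvalues unchanged, hence the question above. A minimal standalone sketch of the layout point (illustration only, not Paddle code):

#include <cstdio>

int main() {
  double buf[4] = {1, 2, 3, 4};
  // Row-major view of the buffer: [[1, 2], [3, 4]]
  std::printf("row-major A(0,1) = %g\n", buf[0 * 2 + 1]);  // prints 2
  // Column-major view (LAPACK's convention): [[1, 3], [2, 4]], the transpose
  std::printf("col-major A(0,1) = %g\n", buf[1 * 2 + 0]);  // prints 3
  return 0;
}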

auto *out_vector = eigen_vectors->mutable_data<T>(ctx.GetPlace());

auto dims = input.dims();
int dim_size = dims.size();
int64_t batch_size = GetBatchSize(dims);

int vector_stride = dims[dim_size - 1] * dims[dim_size - 2];
int values_stride = dims[dim_size - 1];
char uplo = is_lower ? 'L' : 'U';
char jobz = has_vectors ? 'V' : 'N';
auto n = dims[dim_size - 1];
auto lda = std::max<int64_t>(1, n);

int lwork = -1;
int lrwork = -1;
int liwork = -1;
int iwork_opt = -1;
T lwork_opt = static_cast<T>(-1);
ValueType rwork_opt = static_cast<ValueType>(-1);
Review comment (Contributor): What do these parameters mean? What does -1 stand for? Does the LAPACK routine write into them? Please add a comment explaining.
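For reference, passing lwork = -1 (and likewise lrwork = -1, liwork = -1) requests a LAPACK workspace query: the routine performs no factorization and instead writes the optimal buffer sizes into its work outputs. A minimal sketch of the two-phase idiom, calling dsyevd_ directly with the Netlib signature (eigh_sketch is a hypothetical helper, not part of the PR):

#include <vector>

extern "C" void dsyevd_(char *jobz, char *uplo, int *n, double *a, int *lda,
                        double *w, double *work, int *lwork, int *iwork,
                        int *liwork, int *info);

// Eigendecomposition of an n x n column-major symmetric matrix; a is
// overwritten with the eigenvectors and w receives the eigenvalues.
void eigh_sketch(double *a, double *w, int n) {
  char jobz = 'V', uplo = 'L';
  int lda = n, info = 0;
  // Phase 1: workspace query. lwork == liwork == -1 asks dsyevd_ to report
  // the optimal sizes in work_query / iwork_query without factorizing.
  double work_query;
  int iwork_query;
  int lwork = -1, liwork = -1;
  dsyevd_(&jobz, &uplo, &n, a, &lda, w, &work_query, &lwork, &iwork_query,
          &liwork, &info);
  // Phase 2: allocate the reported sizes and run the real decomposition.
  lwork = static_cast<int>(work_query);
  liwork = iwork_query;
  std::vector<double> work(lwork);
  std::vector<int> iwork(liwork);
  dsyevd_(&jobz, &uplo, &n, a, &lda, w, work.data(), &lwork, iwork.data(),
          &liwork, &info);  // info == 0 on success
}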


Tensor info_tensor;
Review comment (Contributor): If only one matrix is computed per call, there seems to be no need to define an info_tensor; a plain int info should be enough?

auto *infos_data = info_tensor.mutable_data<int>(
framework::make_ddim({batch_size}), ctx.GetPlace());

math::lapackEvd<T, ValueType>(jobz, uplo, n, out_vector, lda, out_value,
Review comment (Contributor): Add a comment explaining the purpose of this single standalone LAPACK call.

&lwork_opt, lwork, &rwork_opt, lrwork,
&iwork_opt, liwork, infos_data);

lwork = std::max<int>(1, static_cast<int>(lwork_opt));
liwork = std::max<int>(1, iwork_opt);

Tensor rwork_tensor;
ValueType *rwork_data = nullptr;

// complex type
if (framework::IsComplexType(eigen_vectors->type())) {
lrwork = std::max<int>(1, static_cast<int>(rwork_opt));
rwork_data = rwork_tensor.mutable_data<ValueType>(
framework::make_ddim({lrwork}), ctx.GetPlace());
}
int rows = dims[dims.size() - 2];

auto *value_data =
eigen_values->mutable_data<ValueType>(output_value_dim, ctx.GetPlace());
Tensor iwork_tensor, work_tensor;
auto *iwork_data = iwork_tensor.mutable_data<int>(
framework::make_ddim({liwork}), ctx.GetPlace());
auto *work_data = work_tensor.mutable_data<T>(framework::make_ddim({lwork}),
ctx.GetPlace());

if (framework::IsComplexType(input_tensor.type())) {
auto *x_data = input_tensor.data<T>();
auto *vector_data = eigen_vectors->mutable_data<T>(dims, ctx.GetPlace());
ComputeComplexEigenvaluesAndVectors<T, ValueType>(
x_data, value_data, vector_data, batch_size, rows, rows, has_vectors);
} else {
auto *x_data = input_tensor.data<ValueType>();
auto *vector_data =
eigen_vectors->mutable_data<ValueType>(dims, ctx.GetPlace());
ComputeFloatEigenvaluesAndVectors<ValueType>(
x_data, value_data, vector_data, batch_size, rows, rows, has_vectors);
for (auto i = 0; i < batch_size; i++) {
auto *value_data = out_value + i * values_stride;
auto *vector_data = out_vector + i * vector_stride;
int *info_ptr = &infos_data[i];
math::lapackEvd<T, ValueType>(jobz, uplo, n, vector_data, lda, value_data,
Review comment (Contributor): What does Evd mean, and will it be used by other computations? Would it be better to just call it lapackEigh?

work_data, lwork, rwork_data, lrwork,
iwork_data, liwork, info_ptr);
PADDLE_ENFORCE_EQ(
*info_ptr, 0,
platform::errors::PreconditionNotMet(
"For batch [%d]: the [%d] argument had an illegal value", i,
Review comment (Contributor): This error message is not very meaningful.

*info_ptr));
}
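In line with the review comment above, LAPACK's info convention distinguishes two failure modes: info < 0 means the (-info)-th argument had an illegal value, and info > 0 means the algorithm failed to converge. A hypothetical helper that turns info into a more accurate message (a sketch, not part of the PR):

// Interprets the LAPACK info code returned by ?syevd / ?heevd.
inline const char *LapackInfoMessage(int info) {
  if (info < 0) return "an argument had an illegal value";
  if (info > 0) return "the eigenvalue algorithm failed to converge";
  return "success";
}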
if (has_vectors) {
*eigen_vectors = dito.Transpose(*eigen_vectors);
Review comment (Contributor): Please confirm: are the two Transposes, before and after, both necessary?

}
}
};
@@ -175,6 +134,12 @@ struct MatrixEighFunctor<platform::CUDADeviceContext, ValueType, T> {
Tensor *eigen_values, Tensor *eigen_vectors, bool is_lower,
bool has_vectors) {
auto *out_value = eigen_values->mutable_data<ValueType>(ctx.GetPlace());

auto &dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
auto dito =
math::DeviceIndependenceTensorOperations<platform::CUDADeviceContext,
T>(ctx);
*eigen_vectors = dito.Transpose(input);
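// cuSOLVER, like LAPACK, assumes column-major storage, so the row-major
// input is transposed into eigen_vectors, which the solver then overwrites
// in place with the eigenvectors.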
auto *out_vector = eigen_vectors->mutable_data<T>(ctx.GetPlace());

auto &dims = input.dims();
@@ -191,13 +156,6 @@ struct MatrixEighFunctor<platform::CUDADeviceContext, ValueType, T> {
auto vector_stride = dims[dim_size - 1] * dims[dim_size - 2];
auto values_stride = dims[dim_size - 1];

auto &dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
auto dito =
math::DeviceIndependenceTensorOperations<platform::CUDADeviceContext,
T>(ctx);
Tensor output_v_var_trans = dito.Transpose(input);
TensorCopy(output_v_var_trans, ctx.GetPlace(), eigen_vectors);

int lwork = 0;
auto info = memory::Alloc(dev_ctx, sizeof(int) * batch_size);
auto *info_ptr = reinterpret_cast<int *>(info->ptr());
44 changes: 44 additions & 0 deletions paddle/fluid/operators/math/lapack_function.cc
@@ -31,6 +31,50 @@ void lapackLu<float>(int m, int n, float *a, int lda, int *ipiv, int *info) {
platform::dynload::sgetrf_(&m, &n, a, &lda, ipiv, info);
}

// eigh
template <>
void lapackEvd<float, float>(char jobz, char uplo, int n, float *a, int lda,
float *w, float *work, int lwork, float *rwork,
int lrwork, int *iwork, int liwork, int *info) {
(void)rwork; // unused
(void)lrwork; // unused
platform::dynload::ssyevd_(&jobz, &uplo, &n, a, &lda, w, work, &lwork, iwork,
&liwork, info);
}

template <>
void lapackEvd<double, double>(char jobz, char uplo, int n, double *a, int lda,
double *w, double *work, int lwork,
double *rwork, int lrwork, int *iwork,
int liwork, int *info) {
(void)rwork; // unused
(void)lrwork; // unused
platform::dynload::dsyevd_(&jobz, &uplo, &n, a, &lda, w, work, &lwork, iwork,
&liwork, info);
}

template <>
void lapackEvd<paddle::platform::complex<float>, float>(
Review comment (Contributor): The paddle:: prefix is unnecessary here.

char jobz, char uplo, int n, paddle::platform::complex<float> *a, int lda,
float *w, paddle::platform::complex<float> *work, int lwork, float *rwork,
int lrwork, int *iwork, int liwork, int *info) {
platform::dynload::cheevd_(&jobz, &uplo, &n,
reinterpret_cast<std::complex<float> *>(a), &lda,
w, reinterpret_cast<std::complex<float> *>(work),
&lwork, rwork, &lrwork, iwork, &liwork, info);
}

template <>
void lapackEvd<paddle::platform::complex<double>, double>(
char jobz, char uplo, int n, paddle::platform::complex<double> *a, int lda,
double *w, paddle::platform::complex<double> *work, int lwork,
double *rwork, int lrwork, int *iwork, int liwork, int *info) {
platform::dynload::zheevd_(&jobz, &uplo, &n,
reinterpret_cast<std::complex<double> *>(a), &lda,
w, reinterpret_cast<std::complex<double> *>(work),
&lwork, rwork, &lrwork, iwork, &liwork, info);
}

// Eig
template <>
void lapackEig<double>(char jobvl, char jobvr, int n, double *a, int lda,
13 changes: 9 additions & 4 deletions paddle/fluid/operators/math/lapack_function.h
@@ -20,12 +20,17 @@ namespace math {

// LU (for example)
template <typename T>
void lapackLu(int m, int n, T *a, int lda, int *ipiv, int *info);
void lapackLu(int m, int n, T* a, int lda, int* ipiv, int* info);

template <typename T, typename ValueType = T>
void lapackEvd(char jobz, char uplo, int n, T* a, int lda, ValueType* w,
T* work, int lwork, ValueType* rwork, int lrwork, int* iwork,
int liwork, int* info);
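
// lapackEvd dispatches to {s,d}syevd_ for real T and {c,z}heevd_ for complex
// T (see lapack_function.cc); rwork / lrwork are only consumed by the complex
// heevd variants and are ignored for the real types.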

template <typename T1, typename T2 = T1>
void lapackEig(char jobvl, char jobvr, int n, T1 *a, int lda, T1 *w, T1 *vl,
int ldvl, T1 *vr, int ldvr, T1 *work, int lwork, T2 *rwork,
int *info);
void lapackEig(char jobvl, char jobvr, int n, T1* a, int lda, T1* w, T1* vl,
int ldvl, T1* vr, int ldvr, T1* work, int lwork, T2* rwork,
int* info);

} // namespace math
} // namespace operators
21 changes: 21 additions & 0 deletions paddle/fluid/platform/dynload/lapack.h
@@ -16,6 +16,7 @@ limitations under the License. */

#include <complex>
#include <mutex>
#include "paddle/fluid/platform/complex.h"
#include "paddle/fluid/platform/dynload/dynamic_loader.h"
#include "paddle/fluid/platform/port.h"

@@ -28,6 +29,22 @@ extern "C" void dgetrf_(int *m, int *n, double *a, int *lda, int *ipiv,
extern "C" void sgetrf_(int *m, int *n, float *a, int *lda, int *ipiv,
int *info);

// evd
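// (naming: the s/d prefixes are the real symmetric drivers ?syevd, the c/z
// prefixes the complex Hermitian drivers ?heevd; the trailing 'd' marks the
// divide-and-conquer variant)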
extern "C" void zheevd_(char *jobz, char *uplo, int *n, std::complex<double> *a,
int *lda, double *w, std::complex<double> *work,
int *lwork, double *rwork, int *lrwork, int *iwork,
int *liwork, int *info);
extern "C" void cheevd_(char *jobz, char *uplo, int *n, std::complex<float> *a,
int *lda, float *w, std::complex<float> *work,
int *lwork, float *rwork, int *lrwork, int *iwork,
int *liwork, int *info);
extern "C" void dsyevd_(char *jobz, char *uplo, int *n, double *a, int *lda,
double *w, double *work, int *lwork, int *iwork,
int *liwork, int *info);
extern "C" void ssyevd_(char *jobz, char *uplo, int *n, float *a, int *lda,
float *w, float *work, int *lwork, int *iwork,
int *liwork, int *info);

// geev
extern "C" void dgeev_(char *jobvl, char *jobvr, int *n, double *a, int *lda,
double *wr, double *wi, double *vl, int *ldvl,
@@ -81,6 +98,10 @@ extern void *lapack_dso_handle;
#define LAPACK_ROUTINE_EACH(__macro) \
__macro(dgetrf_); \
__macro(sgetrf_); \
__macro(zheevd_); \
__macro(cheevd_); \
__macro(dsyevd_); \
__macro(ssyevd_); \
__macro(dgeev_); \
__macro(sgeev_); \
__macro(zgeev_); \
1 change: 0 additions & 1 deletion python/paddle/__init__.py
@@ -106,7 +106,6 @@
from .tensor.linalg import multi_dot # noqa: F401
from .tensor.linalg import matrix_power # noqa: F401
from .tensor.linalg import svd # noqa: F401
from .tensor.linalg import eigh # noqa: F401
from .tensor.linalg import pinv # noqa: F401
from .tensor.logic import equal # noqa: F401
from .tensor.logic import greater_equal # noqa: F401
2 changes: 1 addition & 1 deletion python/paddle/tensor/linalg.py
@@ -1759,7 +1759,7 @@ def eigh(x, UPLO='L', name=None):

x_data = np.array([[1, -2j], [2j, 5]])
x = paddle.to_tensor(x_data)
out_value, out_vector = paddle.eigh(x, UPLO='L')
out_value, out_vector = paddle.linalg.eigh(x, UPLO='L')
print(out_value)
#[0.17157288, 5.82842712]
print(out_vector)