From adb8049460b3c14b0d0422fdc2fa10547fc9e912 Mon Sep 17 00:00:00 2001 From: Zeng Jinle <32832641+sneaxiy@users.noreply.github.com> Date: Fri, 15 Oct 2021 22:54:11 +0800 Subject: [PATCH 001/116] Remove wrong __restrict__ of CUDA LarsMomentumOpKernel (#36460) * remove wrong restrict * remove master_param_out __restrict__ * update --- .../operators/optimizers/lars_momentum_op.cu | 104 ++++++------------ 1 file changed, 31 insertions(+), 73 deletions(-) diff --git a/paddle/fluid/operators/optimizers/lars_momentum_op.cu b/paddle/fluid/operators/optimizers/lars_momentum_op.cu index b640e62221f77..89326679d5d50 100644 --- a/paddle/fluid/operators/optimizers/lars_momentum_op.cu +++ b/paddle/fluid/operators/optimizers/lars_momentum_op.cu @@ -84,22 +84,18 @@ class LarsThreadConfig { template __device__ inline void VectorizeLarsUpdate( - const T* __restrict__ grad, const MT* __restrict__ param, - const MT* __restrict__ velocity, T* __restrict__ param_out, - MT* __restrict__ velocity_out, const MT mu, MT local_lr, + const T* __restrict__ grad, const MT* param, const MT* velocity, + T* param_out, MT* velocity_out, const MT mu, MT local_lr, const MT lars_weight_decay, const MT rescale_grad, const int tid, - const int grid_stride, const int numel, - MT* __restrict__ master_param_out = nullptr) { + const int grid_stride, const int numel, MT* master_param_out = nullptr) { using VecType = paddle::platform::AlignedVector; using VecMType = paddle::platform::AlignedVector; int main = numel >> (VecSize >> 1); int tail_offset = main * VecSize; - const VecType* __restrict__ grad_vec = reinterpret_cast(grad); - const VecMType* __restrict__ param_vec = - reinterpret_cast(param); - const VecMType* __restrict__ velocity_vec = - reinterpret_cast(velocity); + const VecType* grad_vec = reinterpret_cast(grad); + const VecMType* param_vec = reinterpret_cast(param); + const VecMType* velocity_vec = reinterpret_cast(velocity); VecType* param_out_vec = reinterpret_cast(param_out); VecMType* velocity_out_vec = reinterpret_cast(velocity_out); @@ -157,66 +153,30 @@ __forceinline__ __device__ void L2NormKernel( template __global__ void L2NormKernel( #endif - const T* __restrict__ p_data, const T* __restrict__ g_data, - MT* __restrict__ p_buffer, MT* __restrict__ g_buffer, const int64_t numel, - const int repeat_times, const MT rescale_grad, const int thresh = 0, - MT* __restrict__ p_n = nullptr, MT* __restrict__ g_n = nullptr) { + const T* p_data, const T* __restrict__ g_data, MT* __restrict__ p_buffer, + MT* __restrict__ g_buffer, const int64_t numel, const int repeat_times, + const MT rescale_grad, const int thresh = 0, MT* __restrict__ p_n = nullptr, + MT* __restrict__ g_n = nullptr) { __shared__ MT s_buffer[2]; int tid = threadIdx.x + blockDim.x * blockIdx.x; int grid_stride = LARS_BLOCK_SIZE * gridDim.x; const MT rescale_pow = rescale_grad * rescale_grad; - if (threadIdx.x == 0) { - s_buffer[0] = static_cast(0); - s_buffer[1] = static_cast(0); - } + MT p_tmp = static_cast(0); MT g_tmp = static_cast(0); - - if (repeat_times == 0) { - if (tid < numel) { - p_tmp = static_cast(p_data[tid]); - g_tmp = static_cast(g_data[tid]); - } - MT tmp0 = math::blockReduceSum(p_tmp * p_tmp, FINAL_MASK); - MT tmp1 = math::blockReduceSum(g_tmp * g_tmp, FINAL_MASK); - if (threadIdx.x == 0) { - s_buffer[0] += tmp0; - s_buffer[1] += tmp1; - } - } else { - /* Avoid occupy too much temp buffer. 
Slice the whole data into 2 parts, - the front of data whose quantity is excatly multiple of grid-thread - number, and delt in for loop, the rest is delt with another step. */ - for (int i = 0; i < repeat_times; ++i) { - p_tmp = static_cast(p_data[tid]); - g_tmp = static_cast(g_data[tid]); - tid += grid_stride; - MT tmp0 = math::blockReduceSum(p_tmp * p_tmp, FINAL_MASK); - MT tmp1 = math::blockReduceSum(g_tmp * g_tmp, FINAL_MASK); - if (threadIdx.x == 0) { - s_buffer[0] += tmp0; - s_buffer[1] += tmp1; - } - __syncthreads(); - } - MT p_val = 0; - MT g_val = 0; - if (tid < numel) { - p_val = static_cast(p_data[tid]); - g_val = static_cast(g_data[tid]); - } - MT tmp0 = math::blockReduceSum(p_val * p_val, FINAL_MASK); - MT tmp1 = math::blockReduceSum(g_val * g_val, FINAL_MASK); - if (threadIdx.x == 0) { - s_buffer[0] += tmp0; - s_buffer[1] += tmp1; - } + while (tid < numel) { + MT tmp0 = static_cast(p_data[tid]); + MT tmp1 = static_cast(g_data[tid]); + p_tmp += (tmp0 * tmp0); + g_tmp += (tmp1 * tmp1); + tid += grid_stride; } - __syncthreads(); + p_tmp = math::blockReduceSum(p_tmp, FINAL_MASK); + g_tmp = math::blockReduceSum(g_tmp, FINAL_MASK); if (threadIdx.x == 0) { - p_buffer[blockIdx.x] = s_buffer[0]; - g_buffer[blockIdx.x] = s_buffer[1]; + p_buffer[blockIdx.x] = p_tmp; + g_buffer[blockIdx.x] = g_tmp; } #if CUDA_VERSION >= 11000 cg->sync(); // Grid sync for writring partial result to gloabl memory @@ -236,10 +196,9 @@ __global__ void L2NormKernel( template __forceinline__ __device__ void MomentumUpdate( - const T* __restrict__ param, const T* __restrict__ grad, - const MT* __restrict__ velocity, T* param_out, MT* velocity_out, - const MT* __restrict__ master_param, MT* __restrict__ master_param_out, - const MT* __restrict__ learning_rate, const MT mu, + const T* param, const T* __restrict__ grad, const MT* velocity, + T* param_out, MT* velocity_out, const MT* master_param, + MT* master_param_out, const MT* __restrict__ learning_rate, const MT mu, const MT lars_weight_decay, const MT lars_coeff, const MT epsilon, const MT rescale_grad, const MT param_norm, const MT grad_norm, const int tid, const int grid_stride, const int64_t numel, @@ -316,14 +275,13 @@ __global__ void MergedMomentumLarsKernel(LarsParamWarpper lars_warpper, template __global__ void MomentumLarsKernel( - const T* __restrict__ param, const T* __restrict__ grad, - const MT* __restrict__ velocity, T* param_out, MT* velocity_out, - const MT* __restrict__ master_param, MT* __restrict__ master_param_out, - const MT* __restrict__ learning_rate, MT* __restrict__ p_buffer, - MT* __restrict__ g_buffer, const MT mu, const MT lars_coeff, - const MT lars_weight_decay, const MT epsilon, const MT rescale_grad, - const int repeat_times, const int thresh, const int64_t numel, - const bool is_amp) { + const T* param, const T* __restrict__ grad, const MT* velocity, + T* param_out, MT* velocity_out, const MT* master_param, + MT* master_param_out, const MT* __restrict__ learning_rate, + MT* __restrict__ p_buffer, MT* __restrict__ g_buffer, const MT mu, + const MT lars_coeff, const MT lars_weight_decay, const MT epsilon, + const MT rescale_grad, const int repeat_times, const int thresh, + const int64_t numel, const bool is_amp) { int tid = threadIdx.x + blockIdx.x * blockDim.x; int grid_stride = gridDim.x * LARS_BLOCK_SIZE; #if CUDA_VERSION >= 11000 From 0452f27cba16b6e152ec3a39b581e5588ec74d2b Mon Sep 17 00:00:00 2001 From: Zhang Zheng <32410583+ZzSean@users.noreply.github.com> Date: Sat, 16 Oct 2021 12:48:38 +0800 Subject: [PATCH 002/116] fix 
the initializer of resnet unit op (#36483) * fix the initializer of resnet unit op * fix the initializer of resnet unit op --- python/paddle/incubate/operators/resnet_unit.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/python/paddle/incubate/operators/resnet_unit.py b/python/paddle/incubate/operators/resnet_unit.py index cba1d4863cbd4..f2f391bdca946 100644 --- a/python/paddle/incubate/operators/resnet_unit.py +++ b/python/paddle/incubate/operators/resnet_unit.py @@ -187,9 +187,7 @@ def _get_default_param_initializer(channels): filter_z_shape = [num_filters, filter_size, filter_size, num_channels_z] self.filter_x = self.create_parameter( - shape=filter_x_shape, - attr=filter_x_attr, - default_initializer=_get_default_param_initializer(num_channels_x)) + shape=filter_x_shape, attr=filter_x_attr, default_initializer=None) self.scale_x = self.create_parameter( shape=bn_param_shape, attr=scale_x_attr, @@ -220,8 +218,7 @@ def _get_default_param_initializer(channels): self.filter_z = self.create_parameter( shape=filter_z_shape, attr=filter_z_attr, - default_initializer=_get_default_param_initializer( - num_channels_z)) + default_initializer=None) self.scale_z = self.create_parameter( shape=bn_param_shape, attr=scale_z_attr, From 314cc4952474c8105176a1f1988d3ffb812a154d Mon Sep 17 00:00:00 2001 From: Zeng Jinle <32832641+sneaxiy@users.noreply.github.com> Date: Sun, 17 Oct 2021 16:40:05 +0800 Subject: [PATCH 003/116] Revert "fix the initializer of resnet unit op (#36483)" (#36487) This reverts commit 0452f27cba16b6e152ec3a39b581e5588ec74d2b. --- python/paddle/incubate/operators/resnet_unit.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/python/paddle/incubate/operators/resnet_unit.py b/python/paddle/incubate/operators/resnet_unit.py index f2f391bdca946..cba1d4863cbd4 100644 --- a/python/paddle/incubate/operators/resnet_unit.py +++ b/python/paddle/incubate/operators/resnet_unit.py @@ -187,7 +187,9 @@ def _get_default_param_initializer(channels): filter_z_shape = [num_filters, filter_size, filter_size, num_channels_z] self.filter_x = self.create_parameter( - shape=filter_x_shape, attr=filter_x_attr, default_initializer=None) + shape=filter_x_shape, + attr=filter_x_attr, + default_initializer=_get_default_param_initializer(num_channels_x)) self.scale_x = self.create_parameter( shape=bn_param_shape, attr=scale_x_attr, @@ -218,7 +220,8 @@ def _get_default_param_initializer(channels): self.filter_z = self.create_parameter( shape=filter_z_shape, attr=filter_z_attr, - default_initializer=None) + default_initializer=_get_default_param_initializer( + num_channels_z)) self.scale_z = self.create_parameter( shape=bn_param_shape, attr=scale_z_attr, From 4e036fa1a0c21b5b089809f575d37b2a0e6538da Mon Sep 17 00:00:00 2001 From: Zeng Jinle <32832641+sneaxiy@users.noreply.github.com> Date: Sun, 17 Oct 2021 23:01:23 +0800 Subject: [PATCH 004/116] refine rescale_grad (#36490) --- paddle/fluid/operators/optimizers/lars_momentum_op.cu | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/paddle/fluid/operators/optimizers/lars_momentum_op.cu b/paddle/fluid/operators/optimizers/lars_momentum_op.cu index 89326679d5d50..2c27a2135c14b 100644 --- a/paddle/fluid/operators/optimizers/lars_momentum_op.cu +++ b/paddle/fluid/operators/optimizers/lars_momentum_op.cu @@ -160,7 +160,6 @@ __global__ void L2NormKernel( __shared__ MT s_buffer[2]; int tid = threadIdx.x + blockDim.x * blockIdx.x; int grid_stride = LARS_BLOCK_SIZE * gridDim.x; - const MT rescale_pow = 
rescale_grad * rescale_grad; MT p_tmp = static_cast(0); MT g_tmp = static_cast(0); @@ -190,7 +189,7 @@ __global__ void L2NormKernel( } __syncthreads(); *p_n = Sqrt(s_buffer[0]); - *g_n = Sqrt(rescale_pow * s_buffer[1]); + *g_n = rescale_grad * Sqrt(s_buffer[1]); #endif } From e496d1e9b05906b38e2e5d424b6d4ad571ff678f Mon Sep 17 00:00:00 2001 From: Haohongxiang <86215757+haohongxiang@users.noreply.github.com> Date: Mon, 18 Oct 2021 10:46:30 +0800 Subject: [PATCH 005/116] modify ut of cond (#36475) --- python/paddle/fluid/tests/unittests/test_linalg_cond.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_linalg_cond.py b/python/paddle/fluid/tests/unittests/test_linalg_cond.py index 237c96430249b..d13bdd676b48e 100644 --- a/python/paddle/fluid/tests/unittests/test_linalg_cond.py +++ b/python/paddle/fluid/tests/unittests/test_linalg_cond.py @@ -32,7 +32,8 @@ def test_static_assert_true(self, x_list, p_list): exe = static.Executor() result = exe.run(feed={"X": x}, fetch_list=[output]) expected_output = np.linalg.cond(x, p) - self.assertTrue(np.allclose(result, expected_output)) + np.testing.assert_allclose( + result[0], expected_output, rtol=5e-5) def test_dygraph_assert_true(self, x_list, p_list): @@ -41,7 +42,8 @@ def test_dygraph_assert_true(self, x_list, p_list): input_tensor = paddle.to_tensor(x) output = paddle.linalg.cond(input_tensor, p) expected_output = np.linalg.cond(x, p) - self.assertTrue(np.allclose(output, expected_output)) + np.testing.assert_allclose( + output.numpy(), expected_output, rtol=5e-5) def gen_input(): @@ -156,5 +158,4 @@ def test_dygraph_empty_tensor_input(self): if __name__ == "__main__": paddle.enable_static() - # paddle.device.set_device("cpu") unittest.main() From 79dbbcced6da823187432dd5f3a40a95b0e864c7 Mon Sep 17 00:00:00 2001 From: Tongxin Bai Date: Mon, 18 Oct 2021 11:01:59 +0800 Subject: [PATCH 006/116] [autograd.functional] Fix a bug on handling v=None in vjp and jvp (#36445) * autograd.functional passed pylint checker. * autograd.functional: fix import errors. * autograd.functional: fixed unit tests. * autograd.functional minor format change * [autograd.functional] Fixed vjp and jvp's v=None bug. --- python/paddle/autograd/functional.py | 19 +++++++++++------ .../tests/unittests/autograd/test_vjp_jvp.py | 21 +++++++++++++++++++ 2 files changed, 34 insertions(+), 6 deletions(-) diff --git a/python/paddle/autograd/functional.py b/python/paddle/autograd/functional.py index 17c7ad5b18af5..66ae1562edb68 100644 --- a/python/paddle/autograd/functional.py +++ b/python/paddle/autograd/functional.py @@ -23,10 +23,11 @@ @contextlib.contextmanager def gradient_scope(*var_lists, create_graph=False, allow_unused=False): - def grad_fn(ys, xs, v, create_graph=create_graph): - assert len(ys) == len(v), ( - f'`v` is expected to be of the same size as the output. ' - f'Here the output is {ys}, and `v` is {v}.') + def grad_fn(ys, xs, v=None, create_graph=create_graph): + if v is not None: + assert len(ys) == len(v), ( + f'The argument {v} is expected to be of the same size as the output. ' + f'Here the output is {ys}, and `v` is {v}.') if allow_unused: ys = [ to_tensor( @@ -49,6 +50,8 @@ def return_fn(out): return out def process(vl): + if vl is None: + return None out = [] # If v is treated as constant in the outer scope, its gradient is guaranteed # not to be taken beyond this scope. 
Within this scope, however, v's gradient @@ -151,7 +154,9 @@ def func_unused(x, y): # [[2., 1.], # [1., 0.]]), None] """ - xs, v = _tensors(inputs, "inputs"), _tensors(v, "v") + xs = _tensors(inputs, "inputs") + if v is not None: + v = _tensors(v, "v") with gradient_scope( xs, v, create_graph=create_graph, @@ -221,7 +226,9 @@ def func(x): # [0., 0.]])] """ - xs, v = _tensors(inputs, "inputs"), _tensors(v, "v") + xs = _tensors(inputs, "inputs") + if v is not None: + v = _tensors(v, "v") with gradient_scope( xs, v, create_graph=create_graph, diff --git a/python/paddle/fluid/tests/unittests/autograd/test_vjp_jvp.py b/python/paddle/fluid/tests/unittests/autograd/test_vjp_jvp.py index f3680ab2a6223..c228ad79321d4 100644 --- a/python/paddle/fluid/tests/unittests/autograd/test_vjp_jvp.py +++ b/python/paddle/fluid/tests/unittests/autograd/test_vjp_jvp.py @@ -205,6 +205,16 @@ def test_vjp_i2o2_no_create_graph(self): vjp_result, grad_result = vjp(), grad() self.check_results(grad_result, vjp_result) + def test_vjp_i2o2_omitting_v_no_create_graph(self): + test_cases = [ + [o2, ['A', 'A']], #noqa + ] #noqa + for f, inputs in test_cases: + inputs = self.gen_inputs(inputs) + vjp, grad = self.gen_test_pairs(f, inputs) + vjp_result, grad_result = vjp(), grad() + self.check_results(grad_result, vjp_result) + def test_vjp_nested_no_create_graph(self): x = self.gen_input('a') test_cases = [ @@ -289,6 +299,17 @@ def test_jvp_i2o2_no_create_graph(self): reverse_jac = jac(vjp, f, inputs) self.check_results(forward_jac, reverse_jac) + def test_jvp_i2o2_omitting_v_no_create_graph(self): + test_cases = [ #noqa + [o2, ['A', 'A']], #noqa + ] #noqa + for f, inputs in test_cases: + inputs = self.gen_inputs(inputs) + results_omitting_v = jvp(f, inputs) + v = [ones_like(x) for x in inputs] + results_with_v = jvp(f, inputs, v) + self.check_results(results_omitting_v, results_with_v) + if __name__ == "__main__": unittest.main() From d3c9394202579ab65bedfb3cbe0cc058a410f600 Mon Sep 17 00:00:00 2001 From: JingZhuangzhuang <75348594+JZZ-NOTE@users.noreply.github.com> Date: Sun, 17 Oct 2021 22:22:30 -0500 Subject: [PATCH 007/116] Fix conv2d op_teller error (#36474) --- paddle/fluid/inference/tensorrt/op_teller.cc | 24 +++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc index 59368a299c59e..89159c0bb636c 100644 --- a/paddle/fluid/inference/tensorrt/op_teller.cc +++ b/paddle/fluid/inference/tensorrt/op_teller.cc @@ -242,9 +242,31 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, if (desc.HasAttr("padding_algorithm")) { auto padding_algorithm = BOOST_GET_CONST(std::string, desc.GetAttr("padding_algorithm")); - if (padding_algorithm == "SAME" || padding_algorithm == "VALID") { + if (padding_algorithm == "VALID") { return false; } + if (padding_algorithm == "SAME") { + if (desc.HasAttr("dilations")) { + const std::vector dilations = + BOOST_GET_CONST(std::vector, desc.GetAttr("dilations")); + if (dilations[0] != 1 || dilations[1] != 1) { + VLOG(3) << "In Same mode, Dilations must be (1, 1) for " + "tensorRT, but given (" + << dilations[0] << ", " << dilations[1] << ")"; + return false; + } + } + } + } + + if (use_no_calib_int8) { + if (desc.HasAttr("padding_algorithm")) { + auto padding_algorithm = + BOOST_GET_CONST(std::string, desc.GetAttr("padding_algorithm")); + if (padding_algorithm == "SAME") { + return false; + } + } } if (desc.HasAttr("enable_int8")) { From 
d19a9b3954f7e29356410824213806b7e27d37e4 Mon Sep 17 00:00:00 2001 From: taixiurong Date: Mon, 18 Oct 2021 11:24:04 +0800 Subject: [PATCH 008/116] [XPU AMP] 1. xpu support gradient acc 2. xpu support create tensor in dygraph 3. xpu support update weight params in amp (#36439) --- .../fluid/imperative/gradient_accumulator.cc | 47 ++++- .../reduce_ops/reduce_mean_op_xpu.cc | 99 ++++++++-- paddle/fluid/operators/slice_op_xpu.cc | 174 ++++++++---------- paddle/fluid/platform/xpu/xpu2_op_list.h | 11 +- python/paddle/fluid/framework.py | 12 ++ python/paddle/optimizer/adamw.py | 7 - python/paddle/tensor/creation.py | 4 +- 7 files changed, 238 insertions(+), 116 deletions(-) diff --git a/paddle/fluid/imperative/gradient_accumulator.cc b/paddle/fluid/imperative/gradient_accumulator.cc index fbc5453f82146..fd6a070c3fc52 100644 --- a/paddle/fluid/imperative/gradient_accumulator.cc +++ b/paddle/fluid/imperative/gradient_accumulator.cc @@ -87,9 +87,17 @@ class TensorAddFunctor : public boost::static_visitor<> { #ifdef PADDLE_WITH_XPU void operator()(const platform::XPUPlace& place) { + using XPUType = typename XPUTypeTrait::Type; platform::XPUDeviceContext* ctx = dynamic_cast( platform::DeviceContextPool::Instance().Get(place)); - xpu::add(ctx->x_context(), x_, y_, y_, static_cast(numel_)); + int r = xpu::add( + ctx->x_context(), reinterpret_cast(x_), + reinterpret_cast(y_), reinterpret_cast(y_), + static_cast(numel_)); + PADDLE_ENFORCE_EQ( + r, XPU_SUCCESS, + platform::errors::External("XPU add kernel return wrong value[%d %s]", + r, XPUAPIErrorMsg[r])); } #else void operator()(const platform::XPUPlace& place) { @@ -154,6 +162,24 @@ class TensorAddFunctor : public boost::static_visitor<> { T* y_; }; +#ifdef PADDLE_WITH_XPU +template +void XPUTensorAddFunctor(const platform::Place& place, + const framework::Tensor& src, framework::Tensor* dst) { + using XPUType = typename XPUTypeTrait::Type; + platform::XPUDeviceContext* ctx = dynamic_cast( + platform::DeviceContextPool::Instance().Get(place)); + const XPUType* x = reinterpret_cast(src.data()); + XPUType* y = reinterpret_cast(dst->mutable_data(place)); + int r = xpu::add(ctx->x_context(), x, y, y, + static_cast(src.numel())); + PADDLE_ENFORCE_EQ( + r, XPU_SUCCESS, + platform::errors::External("XPU add kernel return wrong value[%d %s]", r, + XPUAPIErrorMsg[r])); +} +#endif + template void TensorAddImpl(const framework::Tensor& src, framework::Tensor* dst, const platform::Place& place) { @@ -226,7 +252,26 @@ void TensorAdd(const framework::Variable& src, framework::Variable* dst) { return; } #endif + +#ifdef PADDLE_WITH_XPU + if (platform::is_xpu_place(place)) { + if (data_type == framework::DataTypeTrait::DataType()) { + XPUTensorAddFunctor(place, src_tensor, dst_tensor); + } else if (data_type == + framework::DataTypeTrait::DataType()) { + XPUTensorAddFunctor(place, src_tensor, dst_tensor); + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "Gradient accumulation of data type (%s) on place (%s) is not " + "supported in imperative mode", + framework::DataTypeToString(data_type), place)); + } + return; + } +#endif + PADDLE_TENSOR_ADD(float); + #ifndef PADDLE_WITH_XPU // NOTE(phlrain): xpu only support float PADDLE_TENSOR_ADD(double); diff --git a/paddle/fluid/operators/reduce_ops/reduce_mean_op_xpu.cc b/paddle/fluid/operators/reduce_ops/reduce_mean_op_xpu.cc index b82ecbbe2fcdc..d6c1dc5f02d42 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_mean_op_xpu.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_mean_op_xpu.cc @@ -23,30 +23,103 @@ 
namespace paddle { namespace operators { template class ReduceMeanXPUKernel : public framework::OpKernel { + using XPUType = typename XPUTypeTrait::Type; + public: void Compute(const framework::ExecutionContext& context) const override { PADDLE_ENFORCE_EQ( platform::is_xpu_place(context.GetPlace()), true, platform::errors::Unavailable("This kernel only runs on XPU.")); - // bool reduce_all = context.Attr("reduce_all"); + bool reduce_all = context.Attr("reduce_all"); auto* input = context.Input("X"); auto* output = context.Output("Out"); output->mutable_data(context.GetPlace()); auto& dev_ctx = context.template device_context(); - int ndim = input->dims().size(); - std::vector idims; + + std::vector xdims; for (int i = 0; i < input->dims().size(); i++) { - idims.push_back(input->dims()[i]); + xdims.push_back(input->dims()[i]); } - auto dims = context.Attr>("dim"); - int rdim = dims.size(); - int r = - xpu::reduce(dev_ctx.x_context(), input->data(), output->data(), - idims.data(), ndim, dims.data(), rdim, xpu::REDUCE_MEAN); - PADDLE_ENFORCE_EQ(r == xpu::Error_t::SUCCESS, true, - platform::errors::External("XPU kernel error!")); + auto rdims = context.Attr>("dim"); + if (reduce_all) { + rdims.clear(); + for (size_t i = 0; i < xdims.size(); i++) { + rdims.push_back(static_cast(i)); + } + } + int r = xpu::reduce_mean( + dev_ctx.x_context(), reinterpret_cast(input->data()), + reinterpret_cast(output->data()), xdims, rdims); + + PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, + platform::errors::External( + "XPU reduce_mean kernel return wrong value[%d %s]", r, + XPUAPIErrorMsg[r])); } }; + +template +class ReduceMeanGradXPUKernel : public framework::OpKernel { + using XPUType = typename XPUTypeTrait::Type; + + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* input = ctx.Input("X"); + auto* output_grad = ctx.Input(framework::GradVarName("Out")); + auto* input_grad = ctx.Output(framework::GradVarName("X")); + + XPUType* x_data = + reinterpret_cast(input_grad->mutable_data(ctx.GetPlace())); + const XPUType* dy_data = + reinterpret_cast(output_grad->data()); + + bool reduce_all = ctx.Attr("reduce_all"); + auto reduce_dims = ctx.Attr>("dim"); + + std::vector xdims; + for (int i = 0; i < input->dims().size(); i++) { + xdims.push_back(input->dims()[i]); + } + std::vector ydims; + for (int i = 0; i < output_grad->dims().size(); i++) { + ydims.push_back(output_grad->dims()[i]); + } + + int reduce_numel = 1; + if (reduce_all) { + reduce_dims.clear(); + for (size_t d = 0; d < xdims.size(); ++d) { + reduce_dims.push_back(static_cast(d)); + } + } + for (auto& d : reduce_dims) { + if (d < 0) { + d = d + xdims.size(); + } + reduce_numel *= xdims[d]; + } + + float val = 1.0f / static_cast(reduce_numel); + + auto& dev_ctx = ctx.template device_context(); + + int r = xpu::constant(dev_ctx.x_context(), x_data, input->numel(), + static_cast(val)); + + PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, + platform::errors::External( + "XPU constant kernel return wrong value[%d %s]", r, + XPUAPIErrorMsg[r])); + r = xpu::broadcast_mul(dev_ctx.x_context(), x_data, dy_data, x_data, xdims, + ydims); + + PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, + platform::errors::External( + "XPU broadcast_mul kernel return wrong value[%d %s]", + r, XPUAPIErrorMsg[r])); + } +}; + } // namespace operators } // namespace paddle @@ -54,4 +127,8 @@ REGISTER_OP_XPU_KERNEL( reduce_mean, ops::ReduceMeanXPUKernel); +REGISTER_OP_XPU_KERNEL( + reduce_mean_grad, + ops::ReduceMeanGradXPUKernel); + #endif diff --git 
a/paddle/fluid/operators/slice_op_xpu.cc b/paddle/fluid/operators/slice_op_xpu.cc index 5f98efe8e9146..6ac1027b0ce19 100644 --- a/paddle/fluid/operators/slice_op_xpu.cc +++ b/paddle/fluid/operators/slice_op_xpu.cc @@ -27,6 +27,8 @@ using Tensor = framework::Tensor; template class SliceXPUKernel : public framework::OpKernel { + using XPUType = typename XPUTypeTrait::Type; + public: void Compute(const framework::ExecutionContext& ctx) const override { auto in = ctx.Input("Input"); @@ -83,114 +85,93 @@ class SliceXPUKernel : public framework::OpKernel { } auto& dev_ctx = ctx.template device_context(); - auto* in_data = in->data(); - auto* out_data = out->mutable_data(ctx.GetPlace()); - int r = xpu::slice(dev_ctx.x_context(), in_data, out_data, shape, - starts_extension, ends_extension); - PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, - platform::errors::External("XPU slice kernel error!")); + const XPUType* in_data = reinterpret_cast(in->data()); + XPUType* out_data = + reinterpret_cast(out->mutable_data(ctx.GetPlace())); + int r = xpu::slice(dev_ctx.x_context(), in_data, out_data, shape, + starts_extension, ends_extension); + PADDLE_ENFORCE_EQ( + r, XPU_SUCCESS, + platform::errors::External("XPU slice kernel return wrong value[%d %s]", + r, XPUAPIErrorMsg[r])); } }; template class SliceGradXPUKernel : public framework::OpKernel { + using XPUType = typename XPUTypeTrait::Type; + public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* d_out = ctx.Input(framework::GradVarName("Out")); - auto* d_in = ctx.Output(framework::GradVarName("Input")); - d_in->mutable_data(ctx.GetPlace()); - - auto in_dims = d_in->dims(); - auto axes = ctx.Attr>("axes"); - auto starts = ctx.Attr>("starts"); - auto ends = ctx.Attr>("ends"); + auto* input = ctx.Input("Input"); + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* dinput = ctx.Output(framework::GradVarName("Input")); + + auto axes_int = ctx.Attr>("axes"); + auto starts_int = ctx.Attr>("starts"); + auto ends_int = ctx.Attr>("ends"); + std::vector axes(axes_int.begin(), axes_int.end()); + std::vector starts(starts_int.begin(), starts_int.end()); + std::vector ends(ends_int.begin(), ends_int.end()); + + // Get the accurate attribute value of starts and ends + auto starts_tensor_list = ctx.MultiInput("StartsTensorList"); + if (ctx.HasInput("StartsTensor")) { + starts = GetDataFromTensor(ctx.Input("StartsTensor")); + } else if (starts_tensor_list.size() > 0) { + starts = GetDataFromTensorList(starts_tensor_list); + } - // prepare starts, ends on XPU - int dim_value = 0, start = 0, end = 0; - // If a negative value is passed for any of the start or end indices, - // it represents number of elements before the end of that dimension. - // If the value passed to start or end is larger than the n - // (the number of elements in this dimension), it represents n. - for (size_t i = 0; i < axes.size(); ++i) { - dim_value = in_dims[axes[i]]; - start = starts[i]; - end = ends[i]; - start = start < 0 ? (start + dim_value) : start; - end = end < 0 ? 
(end + dim_value) : end; - start = std::max(start, 0); - end = std::max(end, 0); - end = std::min(end, dim_value); - PADDLE_ENFORCE_GT(end, start, platform::errors::InvalidArgument( - "end should greater than start")); - starts[i] = start; - ends[i] = end; + auto ends_tensor_list = ctx.MultiInput("EndsTensorList"); + if (ctx.HasInput("EndsTensor")) { + ends = GetDataFromTensor(ctx.Input("EndsTensor")); + } else if (ends_tensor_list.size() > 0) { + ends = GetDataFromTensorList(ends_tensor_list); } - size_t shape_size = in_dims.size(); - // the slice XPU kernel require that the length of `start`, `end` must be - // equal - // to the dims size of input tensor, therefore, if shape_size > axes.size(), - // the `starts_extension` and `ends_extension` is necessary. - std::vector starts_extension(shape_size, 0); - std::vector ends_extension(shape_size, 0); - if (shape_size > axes.size()) { - for (size_t i = 0; i < shape_size; ++i) { - ends_extension[i] = in_dims[i]; - } - for (size_t i = 0; i < axes.size(); ++i) { - starts_extension[axes[i]] = starts[i]; - ends_extension[axes[i]] = ends[i]; + + const auto& in_dims = input->dims(); + int rank = in_dims.size(); + + std::vector pad_left(rank); + std::vector out_dims(rank); + std::vector pad_right(rank); + int cnt = 0; + for (int i = 0; i < in_dims.size(); ++i) { + int start = 0; + int end = in_dims[i]; + int axis = cnt < static_cast(axes.size()) ? axes[cnt] : -1; + if (axis == i) { + start = starts[cnt]; + if (start < 0) { + start = (start + in_dims[i]); + } + start = std::max(start, static_cast(0)); + end = ends[cnt]; + if (end < 0) { + end = (end + in_dims[i]); + } + end = std::min(end, static_cast(in_dims[i])); + cnt++; } - } - int* starts_device = nullptr; - int* ends_device = nullptr; - int* starts_host = - shape_size > axes.size() ? starts_extension.data() : starts.data(); - int* ends_host = - shape_size > axes.size() ? 
ends_extension.data() : ends.data(); - PADDLE_ENFORCE_EQ(xpu_malloc(reinterpret_cast(&starts_device), - shape_size * sizeof(int)), - XPU_SUCCESS, - platform::errors::External("XPU has no enough memory")); - PADDLE_ENFORCE_EQ(xpu_malloc(reinterpret_cast(&ends_device), - shape_size * sizeof(int)), - XPU_SUCCESS, - platform::errors::External("XPU has no enough memory")); - memory::Copy(BOOST_GET_CONST(platform::XPUPlace, ctx.GetPlace()), - starts_device, platform::CPUPlace(), starts_host, - shape_size * sizeof(int)); - memory::Copy(BOOST_GET_CONST(platform::XPUPlace, ctx.GetPlace()), - ends_device, platform::CPUPlace(), ends_host, - shape_size * sizeof(int)); - // prepare shape on XPU - std::vector shape(shape_size, 0); - for (size_t i = 0; i < shape_size; ++i) { - shape[i] = in_dims[i]; + pad_left[i] = start; + out_dims[i] = end - start; + pad_right[i] = in_dims[i] - out_dims[i] - pad_left[i]; } - int* shape_device = nullptr; - PADDLE_ENFORCE_EQ(xpu_malloc(reinterpret_cast(&shape_device), - shape_size * sizeof(int)), - XPU_SUCCESS, - platform::errors::External("XPU has no enough memory")); - memory::Copy(BOOST_GET_CONST(platform::XPUPlace, ctx.GetPlace()), - shape_device, platform::CPUPlace(), shape.data(), - shape_size * sizeof(int)); auto& dev_ctx = ctx.template device_context(); - int r = - xpu::slice_backward(dev_ctx.x_context(), shape_device, starts_device, - ends_device, shape_size, d_out->data(), - d_in->data(), d_in->numel(), d_out->numel()); - PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, - platform::errors::External("xpu slice kernel error")); - dev_ctx.Wait(); - // free device data - xpu_free(shape_device); - xpu_free(starts_device); - xpu_free(ends_device); + const XPUType* dout_data = + reinterpret_cast(dout->data()); + XPUType* din_data = + reinterpret_cast(dinput->mutable_data(ctx.GetPlace())); + int r = xpu::pad(dev_ctx.x_context(), dout_data, din_data, + out_dims, pad_left, pad_right, XPUType(0)); + PADDLE_ENFORCE_EQ( + r, XPU_SUCCESS, + platform::errors::External("XPU pad kernel return wrong value[%d %s]", + r, XPUAPIErrorMsg[r])); } }; - } // namespace operators } // namespace paddle @@ -198,8 +179,13 @@ namespace ops = paddle::operators; REGISTER_OP_XPU_KERNEL( slice, ops::SliceXPUKernel, - ops::SliceXPUKernel); + ops::SliceXPUKernel, + ops::SliceXPUKernel); REGISTER_OP_XPU_KERNEL( slice_grad, - ops::SliceGradXPUKernel); + ops::SliceGradXPUKernel, + ops::SliceGradXPUKernel, + ops::SliceGradXPUKernel); #endif diff --git a/paddle/fluid/platform/xpu/xpu2_op_list.h b/paddle/fluid/platform/xpu/xpu2_op_list.h index 651243a4dfe66..5d45e5d9d5050 100644 --- a/paddle/fluid/platform/xpu/xpu2_op_list.h +++ b/paddle/fluid/platform/xpu/xpu2_op_list.h @@ -109,7 +109,16 @@ XPUOpMap& get_kl2_ops() { pOpKernelType(vartype::FP16, XPUPlace())})}, {"iou_similarity", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"arg_max", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})} + {"arg_max", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"reduce_mean", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"reduce_mean_grad", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"slice", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), + pOpKernelType(vartype::FP16, XPUPlace()), + pOpKernelType(vartype::INT32, XPUPlace())})}, + {"slice_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), + pOpKernelType(vartype::FP16, XPUPlace()), + pOpKernelType(vartype::INT32, XPUPlace())})}, // AddMore }; diff --git a/python/paddle/fluid/framework.py 
b/python/paddle/fluid/framework.py index c6367911b88f8..156ba07a4ce08 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -313,6 +313,18 @@ def _current_expected_place(): "You are using GPU version Paddle, but your CUDA device is not set properly. CPU device will be used by default." ) _global_expected_place_ = core.CPUPlace() + elif core.is_compiled_with_xpu(): + try: + device_count = core.get_xpu_device_count() + except Exception as e: + device_count = 0 + if device_count > 0: + _global_expected_place_ = core.XPUPlace(0) + else: + warnings.warn( + "You are using XPU version Paddle, but your XPU device is not set properly. CPU device will be used by default." + ) + _global_expected_place_ = core.CPUPlace() else: _global_expected_place_ = core.CPUPlace() diff --git a/python/paddle/optimizer/adamw.py b/python/paddle/optimizer/adamw.py index f26ee80d0af60..55aaac8dc4852 100644 --- a/python/paddle/optimizer/adamw.py +++ b/python/paddle/optimizer/adamw.py @@ -190,9 +190,6 @@ def __init__(self, self.type = "adamw" - if core.is_compiled_with_xpu(): - self.type = "adam" - # Use _auxiliary_vars together with _set_auxiliary_var/_get_auxiliary_var to achieve that. self._auxiliary_vars = dict() @@ -259,10 +256,6 @@ def _append_decoupled_weight_decay(self, block, param_and_grad): paddle.fluid.layers.assign(input=scaled_param, output=param) def _append_optimize_op(self, block, param_and_grad): - if paddle.is_compiled_with_xpu(): - self._append_decoupled_weight_decay(block, param_and_grad) - return super(AdamW, self)._append_optimize_op(block, param_and_grad) - assert isinstance(block, framework.Block) if isinstance(param_and_grad, dict): param_and_grad = self._update_param_group(param_and_grad) diff --git a/python/paddle/tensor/creation.py b/python/paddle/tensor/creation.py index 71968d67ed693..72b6bd29fd9e7 100644 --- a/python/paddle/tensor/creation.py +++ b/python/paddle/tensor/creation.py @@ -104,9 +104,9 @@ def to_tensor(data, dtype=None, place=None, stop_gradient=True): if place is None: place = _current_expected_place() elif not isinstance(place, (core.Place, core.CPUPlace, core.CUDAPinnedPlace, - core.CUDAPlace, core.NPUPlace)): + core.CUDAPlace, core.NPUPlace, core.XPUPlace)): raise ValueError( - "'place' must be any of paddle.Place, paddle.CPUPlace, paddle.CUDAPinnedPlace, paddle.CUDAPlace, paddle.NPUPlace" + "'place' must be any of paddle.Place, paddle.CPUPlace, paddle.CUDAPinnedPlace, paddle.CUDAPlace, paddle.NPUPlace, paddle.XPUPlace" ) #Todo(zhouwei): Support allocate tensor on any other specified card From 623e36b0d8869691b5eb05652134310462a641cc Mon Sep 17 00:00:00 2001 From: Wangzheee <634486483@qq.com> Date: Mon, 18 Oct 2021 13:46:10 +0800 Subject: [PATCH 009/116] add IPluginV2Layer: AddPluginV2Ext (#36493) --- paddle/fluid/inference/tensorrt/engine.cc | 13 +++++++------ paddle/fluid/inference/tensorrt/engine.h | 6 ++++++ 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/paddle/fluid/inference/tensorrt/engine.cc b/paddle/fluid/inference/tensorrt/engine.cc index d075656d15747..24644645eee49 100644 --- a/paddle/fluid/inference/tensorrt/engine.cc +++ b/paddle/fluid/inference/tensorrt/engine.cc @@ -135,12 +135,6 @@ void TensorRTEngine::FreezeNetwork() { } for (int j = 0; j < layer->getNbOutputs(); j++) { auto *temp_out = layer->getOutput(j); - if (temp_out->isNetworkOutput()) { - VLOG(1) << "Layer(Name: " << layer->getName() - << ") is set to float32 because its output(" - << temp_out->getName() << ") is the output of the network."; - return 
false; - } if (!temp_out->dynamicRangeIsSet()) { VLOG(1) << "Layer(Name: " << layer->getName() << ") is set to float32 because its output(" @@ -357,6 +351,13 @@ nvinfer1::IPluginV2Layer *TensorRTEngine::AddPluginV2Ext( return network()->addPluginV2(inputs, num_inputs, *plugin); } +nvinfer1::IPluginV2Layer *TensorRTEngine::AddPluginV2IOExt( + nvinfer1::ITensor *const *inputs, int num_inputs, + nvinfer1::IPluginV2IOExt *plugin) { + owned_plugin_v2ioext_.emplace_back(plugin); + return network()->addPluginV2(inputs, num_inputs, *plugin); +} + void TensorRTEngine::freshDeviceId() { int count; cudaGetDeviceCount(&count); diff --git a/paddle/fluid/inference/tensorrt/engine.h b/paddle/fluid/inference/tensorrt/engine.h index e22c2488d3b8b..edf69dc7aa2b5 100644 --- a/paddle/fluid/inference/tensorrt/engine.h +++ b/paddle/fluid/inference/tensorrt/engine.h @@ -323,6 +323,10 @@ class TensorRTEngine { int num_inputs, plugin::PluginTensorRTV2Ext* plugin); + nvinfer1::IPluginV2Layer* AddPluginV2IOExt(nvinfer1::ITensor* const* inputs, + int num_inputs, + nvinfer1::IPluginV2IOExt* plugin); + void SetTensorDynamicRange(nvinfer1::ITensor* tensor, float range) { quant_dynamic_range_[tensor] = range; } @@ -429,6 +433,7 @@ class TensorRTEngine { bool with_ernie() { return with_ernie_; } bool disable_trt_plugin_fp16() { return disable_trt_plugin_fp16_; } bool with_dynamic_shape() { return with_dynamic_shape_; } + AnalysisConfig::Precision precision() { return precision_; } #if IS_TRT_VERSION_GE(6000) nvinfer1::IPluginV2Layer* AddDynamicPlugin( @@ -550,6 +555,7 @@ class TensorRTEngine { std::vector> owned_plugin_; std::vector> owned_plugin_v2ext_; + std::vector> owned_plugin_v2ioext_; // TensorRT related internal members template From 051544b6e8af9cef61ba9870b4ab39af40875ce3 Mon Sep 17 00:00:00 2001 From: ceci3 Date: Mon, 18 Oct 2021 14:19:16 +0800 Subject: [PATCH 010/116] quant support matmul_v2 (#36469) * quant support matmul_v2 * fix format --- .../fluid/contrib/slim/quantization/quantization_pass.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py b/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py index e89db1fb1da05..dc355fec0d362 100644 --- a/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py +++ b/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py @@ -51,6 +51,7 @@ "depthwise_conv2d", "mul", "matmul", + "matmul_v2", "relu", "leaky_relu", "relu6", @@ -91,6 +92,7 @@ "conv2d_transpose": [["Input", "Filter"], ["Output"]], "mul": [["X", "Y"], ["Out"]], "matmul": [["X", "Y"], ["Out"]], + "matmul_v2": [["X", "Y"], ["Out"]], "pool2d": [["X"], ["Out"]], "elementwise_add": [["X", "Y"], ["Out"]], "concat": [["X"], ["Out"]], @@ -139,7 +141,9 @@ _conv_ops = ['conv2d', 'depthwise_conv2d', 'conv2d_transpose'] -_channelwise_quant_axis1_ops = ['conv2d_transpose', 'mul'] +_channelwise_quant_axis1_ops = [ + 'conv2d_transpose', 'mul', 'matmul', 'matmul_v2' +] def _get_op_input_var_names(op): @@ -1785,7 +1789,8 @@ class AddQuantDequantPass(object): "bilinear_interp", "nearest_interp", "trilinear_interp", "slice", "squeeze", "elementwise_sub", "mul", "matmul", "relu", "relu6", "leaky_relu", "tanh", "swish", "scale", "transpose", "transpose2", - "sigmoid", "pad2d", "flatten", "flatten2", "batch_norm", "layer_norm" + "sigmoid", "pad2d", "flatten", "flatten2", "batch_norm", "layer_norm", + "matmul_v2" ] # To be compatible with PaddleSlim, not remove _activation_type for now From 
3845afff784453547b59a82e926b17d865550051 Mon Sep 17 00:00:00 2001 From: Siming Dai <908660116@qq.com> Date: Mon, 18 Oct 2021 14:50:59 +0800 Subject: [PATCH 011/116] Add operators for async read & async write (#36333) * fix async_read bug * change index place to cpu * add tensor size judge * add async_read & async_write test * fix bug in async_write * fix mac py3 ci * fix bug for cpu version paddle * fix windows ci bug * change input argument error type * change const_cast to mutable_data * add async_write out-of-bound check and consumate error hint * fix a small bug for dst_tensor * add docs and refine codes * refine docs * notest,test=windows_ci * fix windows ci * fix require * fix code-block * add core.is_compiled_with_cuda() --- paddle/fluid/pybind/imperative.cc | 337 +++++++++++++++++++ python/paddle/tests/test_async_read_write.py | 109 ++++++ 2 files changed, 446 insertions(+) create mode 100644 python/paddle/tests/test_async_read_write.py diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc index 2e22ee90133a8..f94afaa56b8df 100644 --- a/paddle/fluid/pybind/imperative.cc +++ b/paddle/fluid/pybind/imperative.cc @@ -2249,6 +2249,343 @@ void BindImperative(py::module *m_ptr) { const py::args args, const py::kwargs kwargs) { return imperative::PyLayerApply(place, cls, args, kwargs); }); + +#if defined(PADDLE_WITH_CUDA) + m.def( + "async_write", + [](const imperative::VarBase &src, imperative::VarBase &dst, + const imperative::VarBase &offset, const imperative::VarBase &count) { + PADDLE_ENFORCE_EQ( + platform::is_gpu_place(src.Place()), true, + platform::errors::InvalidArgument( + "Required `src` device should be CUDAPlace, but received %d. ", + src.Place())); + PADDLE_ENFORCE_EQ( + platform::is_cuda_pinned_place(dst.Place()), true, + platform::errors::InvalidArgument( + "Required `dst` device should be CUDAPinnedPlace, " + "but received %d. ", + dst.Place())); + PADDLE_ENFORCE_EQ( + platform::is_cpu_place(offset.Place()), true, + platform::errors::InvalidArgument("Required `offset` device should " + "be CPUPlace, but received %d. ", + offset.Place())); + PADDLE_ENFORCE_EQ( + platform::is_cpu_place(count.Place()), true, + platform::errors::InvalidArgument( + "Required `count` device should be CPUPlace, but received %d. ", + count.Place())); + + // TODO(daisiming): In future, add index as arguments following + // async_read. 
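+        // The checks that follow enforce the copy contract: `offset` and `count`
+        // must be one-dimensional tensors of equal length, and `dst` must match
+        // `src` on every dimension except the first. Each (offset, count) pair is
+        // then serviced by a single device-to-host cudaMemcpyAsync of
+        // `count * size` floats issued on the current CUDA stream.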
+ auto &src_tensor = src.Var().Get(); + auto *dst_tensor = dst.MutableVar()->GetMutable(); + auto &offset_tensor = offset.Var().Get(); + auto &count_tensor = count.Var().Get(); + const auto &deviceId = paddle::platform::GetCurrentDeviceId(); + + PADDLE_ENFORCE_EQ(offset_tensor.dims().size(), 1, + platform::errors::InvalidArgument( + "`offset` tensor should be one-dimensional.")); + PADDLE_ENFORCE_EQ(count_tensor.dims().size(), 1, + platform::errors::InvalidArgument( + "`count` tensor should be one-dimensional.")); + PADDLE_ENFORCE_EQ(offset_tensor.numel(), count_tensor.numel(), + platform::errors::InvalidArgument( + "`offset` and `count` tensor size dismatch.")); + PADDLE_ENFORCE_EQ( + src_tensor.dims().size(), dst_tensor->dims().size(), + platform::errors::InvalidArgument( + "`src` and `dst` should have the same tensor shape, " + "except for the first dimension.")); + for (int i = 1; i < src_tensor.dims().size(); i++) { + PADDLE_ENFORCE_EQ( + src_tensor.dims()[i], dst_tensor->dims()[i], + platform::errors::InvalidArgument( + "`src` and `dst` should have the same tensor shape, " + "except for the first dimension.")); + } + + auto stream = paddle::platform::stream::get_current_stream(deviceId) + ->raw_stream(); + + int64_t size = src_tensor.numel() / src_tensor.dims()[0]; + auto *src_data = src_tensor.data(); + auto *dst_data = dst_tensor->mutable_data(dst.Place()); + const int64_t *offset_data = offset_tensor.data(); + const int64_t *count_data = count_tensor.data(); + int64_t src_offset = 0, dst_offset, c; + for (int64_t i = 0; i < offset_tensor.numel(); i++) { + dst_offset = offset_data[i], c = count_data[i]; + PADDLE_ENFORCE_LE(src_offset + c, src_tensor.dims()[0], + platform::errors::InvalidArgument( + "Invalid offset or count index")); + PADDLE_ENFORCE_LE(dst_offset + c, dst_tensor->dims()[0], + platform::errors::InvalidArgument( + "Invalid offset or count index")); + cudaMemcpyAsync( + dst_data + (dst_offset * size), src_data + (src_offset * size), + c * size * sizeof(float), cudaMemcpyDeviceToHost, stream); + src_offset += c; + } + }, + R"DOC( + This api provides a way to write pieces of source tensor to destination tensor + inplacely and asynchronously. In which, we use `offset` and `count` to determine + where to copy. `offset` means the begin points of the copy pieces of `src`, and + `count` means the lengths of the copy pieces of `src`. To be noted, the copy process + will run asynchronously from cuda to pin memory. We can simply remember this as + "gpu async_write to pin_memory". + + Arguments: + + src (Tensor): The source tensor, and the data type should be `float32` currently. + Besides, `src` should be placed on CUDAPlace. + + dst (Tensor): The destination tensor, and the data type should be `float32` currently. + Besides, `dst` should be placed on CUDAPinnedPlace. The shape of `dst` + should be the same with `src` except for the first dimension. + + offset (Tensor): The offset tensor, and the data type should be `int64` currently. + Besides, `offset` should be placed on CPUPlace. The shape of `offset` + should be one-dimensional. + + count (Tensor): The count tensor, and the data type should be `int64` currently. + Besides, `count` should be placed on CPUPlace. The shape of `count` + should be one-dimensinal. + + Examples: + .. 
code-block:: python + + import numpy as np + import paddle + from paddle.fluid import core + from paddle.device import cuda + + if core.is_compiled_with_cuda(): + src = paddle.rand(shape=[100, 50, 50]) + dst = paddle.emtpy(shape=[200, 50, 50]).pin_memory() + offset = paddle.to_tensor( + np.array([0, 60], dtype="int64"), place=paddle.CPUPlace()) + count = paddle.to_tensor( + np.array([40, 60], dtype="int64"), place=paddle.CPUPlace()) + + stream = cuda.Stream() + with cuda.stream_guard(stream): + core.async_write(src, dst, offset, count) + + offset_a = paddle.gather(dst, paddle.to_tensor(np.arange(0, 40))) + offset_b = paddle.gather(dst, paddle.to_tensor(np.arange(60, 120))) + offset_array = paddle.concat([offset_a, offset_b], axis=0) + print(np.allclose(src.numpy(), offset_array.numpy())) # True +)DOC"); + + m.def( + "async_read", + [](const imperative::VarBase &src, imperative::VarBase &dst, + const imperative::VarBase &index, imperative::VarBase &buffer, + const imperative::VarBase &offset, const imperative::VarBase &count) { + PADDLE_ENFORCE_EQ(platform::is_cuda_pinned_place(src.Place()), true, + platform::errors::InvalidArgument( + "Required `src` device should be " + "CUDAPinnedPlace, but received %d.", + src.Place())); + PADDLE_ENFORCE_EQ( + platform::is_gpu_place(dst.Place()), true, + platform::errors::InvalidArgument( + "Required `dst` device should be CUDAPlace, but received %d.", + dst.Place())); + PADDLE_ENFORCE_EQ( + platform::is_cpu_place(index.Place()), true, + platform::errors::InvalidArgument( + "Required `index` device should be CPUPlace, but received %d.", + index.Place())); + PADDLE_ENFORCE_EQ( + platform::is_cuda_pinned_place(buffer.Place()), true, + platform::errors::InvalidArgument( + "Required `buffer` device should be CUDAPinnedPlace, " + "but received %d.", + buffer.Place())); + PADDLE_ENFORCE_EQ( + platform::is_cpu_place(offset.Place()), true, + platform::errors::InvalidArgument( + "Required `offset` device should be CPUPlace, but received %d.", + offset.Place())); + PADDLE_ENFORCE_EQ( + platform::is_cpu_place(count.Place()), true, + platform::errors::InvalidArgument( + "Required `count` device should be CPUPlace, but received %d.", + count.Place())); + + auto &src_tensor = src.Var().Get(); + auto *dst_tensor = dst.MutableVar()->GetMutable(); + auto &index_tensor = index.Var().Get(); + auto *buffer_tensor = + buffer.MutableVar()->GetMutable(); + auto &offset_tensor = offset.Var().Get(); + auto &count_tensor = count.Var().Get(); + auto *dst_data = dst_tensor->mutable_data(dst.Place()); + const auto &deviceId = paddle::platform::GetCurrentDeviceId(); + + PADDLE_ENFORCE_EQ(src_tensor.dims().size(), dst_tensor->dims().size(), + platform::errors::InvalidArgument( + "`src` and `dst` should have same tensor shape, " + "except for the first dimension.")); + PADDLE_ENFORCE_EQ( + src_tensor.dims().size(), buffer_tensor->dims().size(), + platform::errors::InvalidArgument( + "`src` and `buffer` should have same tensor shape, " + "except for the first dimension.")); + for (int i = 1; i < src_tensor.dims().size(); i++) { + PADDLE_ENFORCE_EQ( + src_tensor.dims()[i], dst_tensor->dims()[i], + platform::errors::InvalidArgument( + "`src` and `dst` should have the same tensor shape, " + "except for the first dimension.")); + PADDLE_ENFORCE_EQ( + src_tensor.dims()[i], buffer_tensor->dims()[i], + platform::errors::InvalidArgument( + "`src` and `buffer` should have the same tensor shape, " + "except for the first dimension.")); + } + PADDLE_ENFORCE_EQ(index_tensor.dims().size(), 1, + 
platform::errors::InvalidArgument( + "`index` tensor should be one-dimensional.")); + + auto stream = paddle::platform::stream::get_current_stream(deviceId) + ->raw_stream(); + + int64_t numel = 0; // total copy length + int64_t copy_flag = offset_tensor.dims()[0]; + int64_t size = src_tensor.numel() / src_tensor.dims()[0]; + + if (copy_flag != 0) { + PADDLE_ENFORCE_EQ(offset_tensor.dims().size(), 1, + platform::errors::InvalidArgument( + "`offset` tensor should be one-dimensional.")); + PADDLE_ENFORCE_EQ(count_tensor.dims().size(), 1, + platform::errors::InvalidArgument( + "`count` tensor should be one-dimensional.")); + PADDLE_ENFORCE_EQ(offset_tensor.numel(), count_tensor.numel(), + platform::errors::InvalidArgument( + "`offset` and `count` tensor size dismatch.")); + auto *offset_data = offset_tensor.data(); + auto *count_data = count_tensor.data(); + for (int64_t i = 0; i < count_tensor.numel(); i++) { + numel += count_data[i]; + } + PADDLE_ENFORCE_LE(numel + index_tensor.numel(), + buffer_tensor->dims()[0], + platform::errors::InvalidArgument( + "Buffer tensor size is too small.")); + PADDLE_ENFORCE_LE(numel + index_tensor.numel(), dst_tensor->dims()[0], + platform::errors::InvalidArgument( + "Target tensor size is too small.")); + + int64_t src_offset, dst_offset = 0, c; + auto *src_data = src_tensor.data(); + for (int64_t i = 0; i < offset_tensor.numel(); i++) { + src_offset = offset_data[i], c = count_data[i]; + PADDLE_ENFORCE_LE(src_offset + c, src_tensor.dims()[0], + platform::errors::InvalidArgument( + "Invalid offset or count index.")); + PADDLE_ENFORCE_LE(dst_offset + c, dst_tensor->dims()[0], + platform::errors::InvalidArgument( + "Invalid offset or count index.")); + cudaMemcpyAsync( + dst_data + (dst_offset * size), src_data + (src_offset * size), + c * size * sizeof(float), cudaMemcpyHostToDevice, stream); + dst_offset += c; + } + } else { + PADDLE_ENFORCE_LE(index_tensor.numel(), buffer_tensor->dims()[0], + platform::errors::InvalidArgument( + "Buffer tensor size is too small.")); + } + + // Select the index data to the buffer + auto index_select = [](const framework::Tensor &src_tensor, + const framework::Tensor &index_tensor, + framework::Tensor *buffer_tensor) { + auto *src_data = src_tensor.data(); + auto *index_data = index_tensor.data(); + auto *buffer_data = + buffer_tensor->mutable_data(buffer_tensor->place()); + const int &slice_size = src_tensor.numel() / src_tensor.dims()[0]; + const int ©_bytes = slice_size * sizeof(float); + int64_t c = 0; + for (int64_t i = 0; i < index_tensor.numel(); i++) { + std::memcpy(buffer_data + c * slice_size, + src_data + index_data[i] * slice_size, copy_bytes); + c += 1; + } + }; + index_select(src_tensor, index_tensor, buffer_tensor); + + // Copy the data to device memory + cudaMemcpyAsync(dst_data + (numel * size), buffer_tensor->data(), + index_tensor.numel() * size * sizeof(float), + cudaMemcpyHostToDevice, stream); + }, + R"DOC( + This api provides a way to read from pieces of source tensor to destination tensor + asynchronously. In which, we use `index`, `offset` and `count` to determine where + to read. `index` means the index position of src tensor we want to read. `offset` + and count means the begin points and length of pieces of src tensor we want to read. + To be noted, the copy process will run asynchronously from pin memory to cuda place. + We can simply remember this as "cuda async_read from pin_memory". + + Arguments: + + src (Tensor): The source tensor, and the data type should be `float32` currently. 
+ Besides, `src` should be placed on CUDAPinnedPlace. + + dst (Tensor): The destination tensor, and the data type should be `float32` currently. + Besides, `dst` should be placed on CUDAPlace. The shape of `dst` should + be the same with `src` except for the first dimension. + + index (Tensor): The index tensor, and the data type should be `int64` currently. + Besides, `index` should be on CPUplace. The shape of `index` should + be one-dimensional. + + buffer (Tensor): The buffer tensor, used to buffer index copy tensor temporarily. + The data type should be `float32` currently, and should be placed + on CUDAPinnedPlace. The shape of `buffer` should be the same with `src` except for the first dimension. + + offset (Tensor): The offset tensor, and the data type should be `int64` currently. + Besides, `offset` should be placed on CPUPlace. The shape of `offset` + should be one-dimensional. + + count (Tensor): The count tensor, and the data type should be `int64` currently. + Besides, `count` should be placed on CPUPlace. The shape of `count` + should be one-dimensinal. + + Examples: + .. code-block:: python + + import numpy as np + import paddle + from paddle.fluid import core + from paddle.device import cuda + + if core.is_compiled_with_cuda(): + src = paddle.rand(shape=[100, 50, 50], dtype="float32").pin_memory() + dst = paddle.empty(shape=[100, 50, 50], dtype="float32") + offset = paddle.to_tensor( + np.array([0, 60], dtype="int64"), place=paddle.CPUPlace()) + count = paddle.to_tensor( + np.array([40, 60], dtype="int64"), place=paddle.CPUPlace()) + buffer = paddle.empty(shape=[50, 50, 50], dtype="float32").pin_memory() + index = paddle.to_tensor( + np.array([1, 3, 5, 7, 9], dtype="int64")).cpu() + + stream = cuda.Stream() + with cuda.stream_guard(stream): + core.async_read(src, dst, index, buffer, offset, count) + +)DOC"); +#endif } } // namespace pybind diff --git a/python/paddle/tests/test_async_read_write.py b/python/paddle/tests/test_async_read_write.py new file mode 100644 index 0000000000000..91875b446aba4 --- /dev/null +++ b/python/paddle/tests/test_async_read_write.py @@ -0,0 +1,109 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
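+# The cases below exercise core.async_read and core.async_write end to end;
+# the __main__ guard at the bottom only runs them when Paddle is compiled
+# with CUDA.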
+ +import unittest +import numpy as np + +import paddle +from paddle.fluid import core +from paddle.device import cuda + + +class TestAsyncRead(unittest.TestCase): + def setUp(self): + self.empty = paddle.to_tensor( + np.array( + [], dtype="int64"), place=paddle.CPUPlace()) + data = np.random.randn(100, 50, 50).astype("float32") + self.src = paddle.to_tensor(data, place=paddle.CUDAPinnedPlace()) + self.dst = paddle.empty(shape=[100, 50, 50], dtype="float32") + self.index = paddle.to_tensor( + np.array( + [1, 3, 5, 7, 9], dtype="int64")).cpu() + self.buffer = paddle.empty( + shape=[50, 50, 50], dtype="float32").pin_memory() + self.stream = cuda.Stream() + + def test_async_read_empty_offset_and_count(self): + with cuda.stream_guard(self.stream): + core.async_read(self.src, self.dst, self.index, self.buffer, + self.empty, self.empty) + array1 = paddle.gather(self.src, self.index) + array2 = self.dst[:len(self.index)] + + self.assertTrue(np.allclose(array1.numpy(), array2.numpy())) + + def test_async_read_success(self): + offset = paddle.to_tensor( + np.array( + [10, 20], dtype="int64"), place=paddle.CPUPlace()) + count = paddle.to_tensor( + np.array( + [5, 10], dtype="int64"), place=paddle.CPUPlace()) + with cuda.stream_guard(self.stream): + core.async_read(self.src, self.dst, self.index, self.buffer, offset, + count) + + # index data + index_array1 = paddle.gather(self.src, self.index) + count_numel = paddle.sum(count).numpy()[0] + index_array2 = self.dst[count_numel:count_numel + len(self.index)] + self.assertTrue(np.allclose(index_array1.numpy(), index_array2.numpy())) + + # offset, count + offset_a = paddle.gather(self.src, paddle.to_tensor(np.arange(10, 15))) + offset_b = paddle.gather(self.src, paddle.to_tensor(np.arange(20, 30))) + offset_array1 = paddle.concat([offset_a, offset_b], axis=0) + offset_array2 = self.dst[:count_numel] + self.assertTrue( + np.allclose(offset_array1.numpy(), offset_array2.numpy())) + + def test_async_read_only_1dim(self): + src = paddle.rand([40], dtype="float32").pin_memory() + dst = paddle.empty([40], dtype="float32") + buffer_ = paddle.empty([20]).pin_memory() + with cuda.stream_guard(self.stream): + core.async_read(src, dst, self.index, buffer_, self.empty, + self.empty) + array1 = paddle.gather(src, self.index) + array2 = dst[:len(self.index)] + self.assertTrue(np.allclose(array1.numpy(), array2.numpy())) + + +class TestAsyncWrite(unittest.TestCase): + def setUp(self): + self.src = paddle.rand(shape=[100, 50, 50, 5], dtype="float32") + self.dst = paddle.empty( + shape=[200, 50, 50, 5], dtype="float32").pin_memory() + self.stream = cuda.Stream() + + def test_async_write_success(self): + offset = paddle.to_tensor( + np.array( + [0, 60], dtype="int64"), place=paddle.CPUPlace()) + count = paddle.to_tensor( + np.array( + [40, 60], dtype="int64"), place=paddle.CPUPlace()) + with cuda.stream_guard(self.stream): + core.async_write(self.src, self.dst, offset, count) + + offset_a = paddle.gather(self.dst, paddle.to_tensor(np.arange(0, 40))) + offset_b = paddle.gather(self.dst, paddle.to_tensor(np.arange(60, 120))) + offset_array = paddle.concat([offset_a, offset_b], axis=0) + self.assertTrue(np.allclose(self.src.numpy(), offset_array.numpy())) + + +if __name__ == "__main__": + if core.is_compiled_with_cuda(): + unittest.main() From 8757fc5b24f0884df57719690d2b0c3fd860d0b6 Mon Sep 17 00:00:00 2001 From: Qi Li Date: Mon, 18 Oct 2021 15:09:46 +0800 Subject: [PATCH 012/116] [NPU] fix dtype for arg_max, test=develop (#36457) --- 
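A minimal usage sketch of what this change affects, assuming any recent Paddle 2.x build (the snippet itself is backend-agnostic; on an NPU build the kernel fixed below is the one that services it): `paddle.argmax` accepts a `dtype` argument selecting the index type, and the NPU kernel now dispatches on that request.

    import paddle

    x = paddle.rand([3, 4, 5])
    # `dtype` picks the index type of the result; after this patch the NPU
    # kernel honours it too, instead of mis-setting the ArgMaxV2 dtype attribute.
    print(paddle.argmax(x, axis=0, dtype="int32").dtype)   # paddle.int32
    print(paddle.argmax(x, axis=0, dtype="int64").dtype)   # paddle.int64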
paddle/fluid/operators/arg_max_op_npu.cc | 57 ++++++++----- paddle/fluid/operators/npu_op_runner.cc | 15 ++++ paddle/fluid/operators/npu_op_runner.h | 6 ++ .../unittests/npu/test_arg_max_op_npu.py | 83 ++++++++++++++++--- python/paddle/nn/functional/loss.py | 15 ++-- 5 files changed, 139 insertions(+), 37 deletions(-) diff --git a/paddle/fluid/operators/arg_max_op_npu.cc b/paddle/fluid/operators/arg_max_op_npu.cc index 38f9813ad02b4..8b70332c651c8 100644 --- a/paddle/fluid/operators/arg_max_op_npu.cc +++ b/paddle/fluid/operators/arg_max_op_npu.cc @@ -17,30 +17,49 @@ limitations under the Licnse. */ namespace paddle { namespace operators { + using Tensor = framework::Tensor; +using NPUDeviceContext = platform::NPUDeviceContext; -template -class ArgMaxNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - int64_t axis = ctx.Attr("axis"); - auto dtype = ctx.Attr("dtype"); +template +struct VisitDataArgNPUMaxFunctor { + const framework::ExecutionContext& ctx; - auto* out = ctx.Output("Out"); - out->mutable_data(ctx.GetPlace()); + explicit VisitDataArgNPUMaxFunctor(const framework::ExecutionContext& ctx) + : ctx(ctx) {} + template + void apply() const { + auto& x = *(ctx.Input("X")); + auto& out = *(ctx.Output("Out")); + out.template mutable_data(ctx.GetPlace()); + auto axis = ctx.Attr("axis"); + auto dtype = ctx.Attr("dtype"); + auto stream = ctx.template device_context().stream(); NpuOpRunner runner; runner.SetType("ArgMaxV2") - .AddInput(*x) + .AddInput(x) .AddInput(std::vector{axis}) - .AddOutput(*out) - .AddAttr("dtype", dtype); + .AddOutput(out) + .AddAttrDataType("dtype", dtype) + .Run(stream); + } +}; - auto stream = - ctx.template device_context() - .stream(); - runner.Run(stream); +template +class ArgMaxNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto& dtype = ctx.Attr("dtype"); + if (dtype < 0) { + framework::VisitDataTypeTiny(static_cast( + framework::proto::VarType::INT64), + VisitDataArgNPUMaxFunctor(ctx)); + return; + } + framework::VisitDataTypeTiny( + static_cast(dtype), + VisitDataArgNPUMaxFunctor(ctx)); } }; @@ -48,7 +67,5 @@ class ArgMaxNPUKernel : public framework::OpKernel { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_NPU_KERNEL( - arg_max, ops::ArgMaxNPUKernel, - ops::ArgMaxNPUKernel); +REGISTER_OP_NPU_KERNEL(arg_max, ops::ArgMaxNPUKernel, + ops::ArgMaxNPUKernel); diff --git a/paddle/fluid/operators/npu_op_runner.cc b/paddle/fluid/operators/npu_op_runner.cc index d10e94962d6a6..830e18cb8a14c 100644 --- a/paddle/fluid/operators/npu_op_runner.cc +++ b/paddle/fluid/operators/npu_op_runner.cc @@ -188,6 +188,21 @@ NpuOpRunner &NpuOpRunner::AddAttr(const std::string &name, return *this; } +NpuOpRunner &NpuOpRunner::AddAttrDataType(const std::string &name, + const NPUAttribute &attr) { + PADDLE_ENFORCE_EQ( + (attr.type() == typeid(int)), true, + platform::errors::InvalidArgument( + "Attr type is NOT equal to framework::proto::VarType::Type.")); + if (!attr_) { + attr_ = aclopCreateAttr(); + } + auto dtype = ConvertToNpuDtype( + static_cast(BOOST_GET_CONST(int, attr))); + PADDLE_ENFORCE_NPU_SUCCESS(aclopSetAttrDataType(attr_, name.c_str(), dtype)); + return *this; +} + NpuOpRunner &NpuOpRunner::AddAttrs(const NPUAttributeMap &attrs) { for (const auto &pair : attrs) { AddAttr(pair.first, pair.second); diff --git a/paddle/fluid/operators/npu_op_runner.h 
b/paddle/fluid/operators/npu_op_runner.h index 45e973970a956..6db5f17d67118 100644 --- a/paddle/fluid/operators/npu_op_runner.h +++ b/paddle/fluid/operators/npu_op_runner.h @@ -58,6 +58,12 @@ class NpuOpRunner { NpuOpRunner &AddAttr(const std::string &name, const NPUAttribute &attr); + // NOTE(qili93): need to add indivisual api for aclopSetAttrDataType + // as typeid(aclDataType) and typeid(framework::proto::VarType::Type) + // always go to attr.type() == typeid(int) to call aclopSetAttrInt + NpuOpRunner &AddAttrDataType(const std::string &name, + const NPUAttribute &attr); + NpuOpRunner &AddAttrs(const NPUAttributeMap &attrs); NpuOpRunner &AddInput(const Tensor &tensor); diff --git a/python/paddle/fluid/tests/unittests/npu/test_arg_max_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_arg_max_op_npu.py index 9bc46697c0dfc..85ade1179b7d6 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_arg_max_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_arg_max_op_npu.py @@ -1,10 +1,10 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # -# http://www.apache.org/licenses/LICENSE-2.0 +# http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, @@ -20,30 +20,31 @@ sys.path.append("..") from op_test import OpTest import paddle +import paddle.fluid as fluid import paddle.fluid.core as core +from paddle.fluid import Program, program_guard paddle.enable_static() class BaseTestCase(OpTest): + def set_npu(self): + self.__class__.use_npu = True + self.place = paddle.NPUPlace(0) + def initTestCase(self): self.op_type = 'arg_max' - self.dims = (3, 4) + self.dims = (3, 4, 5) self.dtype = 'float32' - self.axis = 1 + self.axis = 0 def setUp(self): + self.set_npu() self.initTestCase() - self.__class__.use_npu = True - self.place = paddle.NPUPlace(0) - np.random.seed(2021) - self.x = (np.random.random(self.dims)).astype(self.dtype) + self.x = (1000 * np.random.random(self.dims)).astype(self.dtype) self.inputs = {'X': self.x} self.attrs = {'axis': self.axis} - if self.op_type == "arg_min": - self.outputs = {'Out': np.argmin(self.x, axis=self.axis)} - else: - self.outputs = {'Out': np.argmax(self.x, axis=self.axis)} + self.outputs = {'Out': np.argmax(self.x, axis=self.axis)} def test_check_output(self): self.check_output_with_place(self.place) @@ -211,6 +212,64 @@ def initTestCase(self): self.axis = 0 +class BaseTestComplex1_1(OpTest): + def set_npu(self): + self.__class__.use_npu = True + self.place = paddle.NPUPlace(0) + + def initTestCase(self): + self.op_type = 'arg_max' + self.dims = (4, 5, 6) + self.dtype = 'float32' + self.axis = 2 + + def setUp(self): + self.set_npu() + self.initTestCase() + self.x = (np.random.random(self.dims)).astype(self.dtype) + self.inputs = {'X': self.x} + self.attrs = { + 'axis': self.axis, + 'dtype': int(core.VarDesc.VarType.INT32) + } + self.outputs = { + 'Out': np.argmax( + self.x, axis=self.axis).astype("int32") + } + + def test_check_output(self): + self.check_output_with_place(self.place) + + +class BaseTestComplex1_2(OpTest): + def set_npu(self): + self.__class__.use_npu = True + self.place = paddle.NPUPlace(0) + + def initTestCase(self): + self.op_type = 'arg_max' + 
self.dims = (4, 5, 6) + self.dtype = 'float16' + self.axis = 2 + + def setUp(self): + self.set_npu() + self.initTestCase() + self.x = (np.random.random(self.dims)).astype(self.dtype) + self.inputs = {'X': self.x} + self.attrs = { + 'axis': self.axis, + 'dtype': int(core.VarDesc.VarType.INT32) + } + self.outputs = { + 'Out': np.argmax( + self.x, axis=self.axis).astype("int32") + } + + def test_check_output(self): + self.check_output_with_place(self.place) + + class TestArgMaxAPI(unittest.TestCase): def initTestCase(self): self.dims = (3, 4, 5) diff --git a/python/paddle/nn/functional/loss.py b/python/paddle/nn/functional/loss.py index b1db45ad50669..adf93b24d3926 100755 --- a/python/paddle/nn/functional/loss.py +++ b/python/paddle/nn/functional/loss.py @@ -1675,11 +1675,16 @@ def cross_entropy(input, raise ValueError( "Target({}) is out of class_dimension's upper bound({})". format(invalid_label[0], input.shape[axis] - 1)) - - _, out = _C_ops.softmax_with_cross_entropy( - input, label, 'soft_label', soft_label, 'ignore_index', - ignore_index, 'numeric_stable_mode', True, 'axis', axis, - 'use_softmax', use_softmax) + if core.is_compiled_with_npu(): + _, _, out = _C_ops.softmax_with_cross_entropy( + input, label, 'soft_label', soft_label, 'ignore_index', + ignore_index, 'numeric_stable_mode', True, 'axis', axis, + 'use_softmax', use_softmax) + else: + _, out = _C_ops.softmax_with_cross_entropy( + input, label, 'soft_label', soft_label, 'ignore_index', + ignore_index, 'numeric_stable_mode', True, 'axis', axis, + 'use_softmax', use_softmax) if weight is not None: From cbd15f7d00b4e639b2b115d4aee61a8b48faa9ce Mon Sep 17 00:00:00 2001 From: Qi Li Date: Mon, 18 Oct 2021 15:10:07 +0800 Subject: [PATCH 013/116] [NPU] add kernels for elementwise_add gather_nd tile, test=develop (#36464) --- .../elementwise/elementwise_add_op_npu.cc | 3 ++ paddle/fluid/operators/gather_nd_op_npu.cc | 36 +++++++++--------- paddle/fluid/operators/tile_op_npu.cc | 38 +++++++++++-------- .../npu/test_elementwise_add_op_npu.py | 15 +++++--- .../unittests/npu/test_gather_nd_op_npu.py | 16 ++++---- .../tests/unittests/npu/test_tile_op_npu.py | 20 +++++++++- 6 files changed, 80 insertions(+), 48 deletions(-) diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op_npu.cc b/paddle/fluid/operators/elementwise/elementwise_add_op_npu.cc index cd1d50a017c36..41d5d718c2420 100644 --- a/paddle/fluid/operators/elementwise/elementwise_add_op_npu.cc +++ b/paddle/fluid/operators/elementwise/elementwise_add_op_npu.cc @@ -146,6 +146,9 @@ namespace ops = paddle::operators; namespace plat = paddle::platform; REGISTER_OP_NPU_KERNEL(elementwise_add, ops::ElementwiseAddNPUKernel, +#ifdef PADDLE_WITH_ASCEND_INT64 + ops::ElementwiseAddNPUKernel, +#endif ops::ElementwiseAddNPUKernel); REGISTER_OP_NPU_KERNEL(elementwise_add_grad, diff --git a/paddle/fluid/operators/gather_nd_op_npu.cc b/paddle/fluid/operators/gather_nd_op_npu.cc index d04e0bce36fab..8102322bd3b0c 100644 --- a/paddle/fluid/operators/gather_nd_op_npu.cc +++ b/paddle/fluid/operators/gather_nd_op_npu.cc @@ -18,7 +18,10 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -template +using Tensor = framework::Tensor; +using NPUDeviceContext = platform::NPUDeviceContext; + +template class GatherNdNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { @@ -49,14 +52,12 @@ class GatherNdNPUKernel : public framework::OpKernel { framework::proto::VarType::INT64))); const auto &runner = NpuOpRunner("GatherNd", {*x, *index}, {*out}, {}); - auto stream = - ctx.template device_context() - .stream(); + auto stream = ctx.template device_context().stream(); runner.Run(stream); } }; -template +template class GatherNdGradNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { @@ -91,10 +92,7 @@ class GatherNdGradNPUKernel : public framework::OpKernel { dout = &tmp_tensor2; } - auto stream = - ctx.template device_context() - .stream(); - + auto stream = ctx.template device_context().stream(); platform::NPUMemsetAsync(static_cast(p), 0, dx->numel() * sizeof(T), stream); @@ -108,13 +106,13 @@ class GatherNdGradNPUKernel : public framework::OpKernel { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_NPU_KERNEL( - gather_nd, ops::GatherNdNPUKernel, - ops::GatherNdNPUKernel); - -REGISTER_OP_NPU_KERNEL( - gather_nd_grad, - ops::GatherNdGradNPUKernel, - ops::GatherNdGradNPUKernel); +REGISTER_OP_NPU_KERNEL(gather_nd, + ops::GatherNdNPUKernel, +#ifdef PADDLE_WITH_ASCEND_INT64 + ops::GatherNdNPUKernel, +#endif + ops::GatherNdNPUKernel); + +REGISTER_OP_NPU_KERNEL(gather_nd_grad, + ops::GatherNdGradNPUKernel, + ops::GatherNdGradNPUKernel); diff --git a/paddle/fluid/operators/tile_op_npu.cc b/paddle/fluid/operators/tile_op_npu.cc index c85a1cbc671af..95d7cb9e362c7 100644 --- a/paddle/fluid/operators/tile_op_npu.cc +++ b/paddle/fluid/operators/tile_op_npu.cc @@ -16,7 +16,11 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -template + +using Tensor = framework::Tensor; +using NPUDeviceContext = platform::NPUDeviceContext; + +template class TileNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -92,18 +96,21 @@ class TileNPUKernel : public framework::OpKernel { std::vector temp(repeat_times.size(), 1); if (repeat_times == temp) { - framework::TensorCopy( - *in0, context.GetPlace(), - context.template device_context(), out0); + framework::TensorCopy(*in0, context.GetPlace(), + context.template device_context(), + out0); return; } - const auto& runner = - NpuOpRunner("TileD", {*in0}, {*out0}, {{"multiples", repeat_times}}); - auto stream = - context.template device_context() - .stream(); - runner.Run(stream); + // const auto& runner = + // NpuOpRunner("TileD", {*in0}, {*out0}, {{"multiples", repeat_times}}); + auto stream = context.template device_context().stream(); + NpuOpRunner runner; + runner.SetType("Tile") + .AddInput(*in0) + .AddInput(std::move(repeat_times)) + .AddOutput(*out0) + .Run(stream); } }; @@ -111,8 +118,9 @@ class TileNPUKernel : public framework::OpKernel { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_NPU_KERNEL( - tile, ops::TileNPUKernel, - ops::TileNPUKernel, - ops::TileNPUKernel); +REGISTER_OP_NPU_KERNEL(tile, ops::TileNPUKernel, ops::TileNPUKernel, +#ifdef PADDLE_WITH_ASCEND_INT64 + ops::TileNPUKernel, +#endif + ops::TileNPUKernel, + ops::TileNPUKernel); diff --git a/python/paddle/fluid/tests/unittests/npu/test_elementwise_add_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_elementwise_add_op_npu.py index 9b27e75e37d25..75c70e0a131ac 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_elementwise_add_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_elementwise_add_op_npu.py @@ -65,7 +65,7 @@ def test_check_output(self): self.check_output_with_place(self.place) def test_check_grad_normal(self): - if self.dtype == np.float16: + if self.dtype == np.float16 or self.dtype == np.int64: return self.check_grad_with_place( @@ -75,7 +75,7 @@ def test_check_grad_normal(self): max_relative_error=0.006, ) def test_check_grad_ingore_x(self): - if self.dtype == np.float16: + if self.dtype == np.float16 or self.dtype == np.int64: return self.check_grad_with_place( @@ -86,7 +86,7 @@ def test_check_grad_ingore_x(self): max_relative_error=0.006, ) def test_check_grad_ingore_y(self): - if self.dtype == np.float16: + if self.dtype == np.float16 or self.dtype == np.int64: return self.check_grad_with_place( @@ -102,6 +102,11 @@ def init_dtype(self): self.dtype = np.float16 +class TestINT64ElementwiseAddOp(TestElementwiseAddOp): + def init_dtype(self): + self.dtype = np.int64 + + @skip_check_grad_ci( reason="[skip shape check] Use y_shape(1) to test broadcast.") class TestElementwiseAddOp_scalar(TestElementwiseAddOp): @@ -507,8 +512,8 @@ def gen_data(): def test_dygraph(self): with fluid.dygraph.guard(paddle.NPUPlace(0)): - np_x = np.array([2, 3, 4]).astype('float64') - np_y = np.array([1, 5, 2]).astype('float64') + np_x = np.array([2, 3, 4]).astype('float32') + np_y = np.array([1, 5, 2]).astype('float32') x = fluid.dygraph.to_variable(np_x) y = fluid.dygraph.to_variable(np_y) z = self._executed_api(x, y) diff --git a/python/paddle/fluid/tests/unittests/npu/test_gather_nd_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_gather_nd_op_npu.py index b124a54624171..acb4ffd686fa2 100644 --- 
a/python/paddle/fluid/tests/unittests/npu/test_gather_nd_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_gather_nd_op_npu.py @@ -61,7 +61,7 @@ def test_check_output(self): self.check_output_with_place(self.place) def test_check_grad(self): - if typename == "float16": + if typename == "float16" or typename == "int64": self.__class__.no_need_check_grad = True else: self.check_grad_with_place(self.place, ['X'], 'Out') @@ -88,7 +88,7 @@ def test_check_output(self): self.check_output_with_place(self.place) def test_check_grad(self): - if typename == "float16": + if typename == "float16" or typename == "int64": self.__class__.no_need_check_grad = True else: self.check_grad_with_place(self.place, ['X'], 'Out') @@ -120,7 +120,7 @@ def test_check_output(self): self.check_output_with_place(self.place) def test_check_grad(self): - if typename == "float16": + if typename == "float16" or typename == "int64": self.__class__.no_need_check_grad = True else: self.check_grad_with_place( @@ -153,7 +153,7 @@ def test_check_output(self): self.check_output_with_place(self.place) def test_check_grad(self): - if typename == "float16": + if typename == "float16" or typename == "int64": self.__class__.no_need_check_grad = True else: self.check_grad_with_place(self.place, ['X'], 'Out') @@ -184,7 +184,7 @@ def test_check_output(self): self.check_output_with_place(self.place) def test_check_grad(self): - if typename == "float16": + if typename == "float16" or typename == "int64": self.__class__.no_need_check_grad = True else: self.check_grad_with_place(self.place, ['X'], 'Out') @@ -217,7 +217,7 @@ def test_check_output(self): self.check_output_with_place(self.place) def test_check_grad(self): - if typename == "float16": + if typename == "float16" or typename == "int64": self.__class__.no_need_check_grad = True else: self.check_grad_with_place(self.place, ['X'], 'Out') @@ -252,7 +252,7 @@ def test_check_output(self): self.check_output_with_place(self.place) def test_check_grad(self): - if typename == "float16": + if typename == "float16" or typename == "int64": self.__class__.no_need_check_grad = True else: self.check_grad_with_place(self.place, ['X'], 'Out') @@ -276,7 +276,7 @@ def test_imperative(self): paddle.enable_static() -for _typename in {'float16', 'float32'}: +for _typename in {'float16', 'float32', 'int64'}: test_class1('gather_nd', _typename) test_class2('gather_nd', _typename) test_class3('gather_nd', _typename) diff --git a/python/paddle/fluid/tests/unittests/npu/test_tile_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_tile_op_npu.py index 0da80189f7d40..0e61fa00fdf28 100755 --- a/python/paddle/fluid/tests/unittests/npu/test_tile_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_tile_op_npu.py @@ -206,7 +206,7 @@ def setUp(self): self.op_type = "tile" self.inputs = { 'X': np.random.randint( - 10, size=(2, 4, 5)).astype("int32") + 10, size=(2, 4, 5)).astype("int64") } self.attrs = {'repeat_times': [2, 1, 4]} output = np.tile(self.inputs['X'], (2, 1, 4)) @@ -219,6 +219,24 @@ def test_check_output(self): self.check_output_with_place(self.place) +# Situation 6: input x is Bool +class TestTileOpBool(OpTest): + def setUp(self): + self.set_npu() + self.place = paddle.NPUPlace(0) + self.op_type = "tile" + self.inputs = {'X': np.random.randint(1, size=(2, 4, 5)).astype("bool")} + self.attrs = {'repeat_times': [2, 1, 4]} + output = np.tile(self.inputs['X'], (2, 1, 4)) + self.outputs = {'Out': output} + + def set_npu(self): + self.__class__.use_npu = True + + def 
test_check_output(self): + self.check_output_with_place(self.place) + + # Test python API class TestTileAPI(unittest.TestCase): def test_api(self): From b7f7664764840d3192de81b5d601f17db10310f2 Mon Sep 17 00:00:00 2001 From: xiaoxiaohehe001 <49090790+xiaoxiaohehe001@users.noreply.github.com> Date: Mon, 18 Oct 2021 15:39:47 +0800 Subject: [PATCH 014/116] Add quant axis (#36467) * add_quant_axis * add_quant_axis * --amend * Update quant_conv2d_dequant_fuse_pass.cc --- .../ir/quant_conv2d_dequant_fuse_pass.cc | 36 ++++++++++++++++++- 1 file changed, 35 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc b/paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc index 1864899b07e01..22babcc719aeb 100644 --- a/paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc +++ b/paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc @@ -437,7 +437,11 @@ void QuantDequantFusePass::FuseDequant(ir::Graph* graph, Scope* scope, BOOST_GET_CONST(int, quantized_op_node->Op()->GetAttr("bit_length")); int range = ((1 << (bit_length - 1)) - 1); std::vector weight_scale; - + int quant_axis = 0; + if (dequant_op_node->Op()->HasAttr("quant_axis")) { + quant_axis = + BOOST_GET_CONST(int, dequant_op_node->Op()->GetAttr("quant_axis")); + } // Get weight scale if (dequant_type == "fake_channel_wise_dequantize_max_abs") { Node* dequant_channel_scale_node = @@ -488,6 +492,16 @@ void QuantDequantFusePass::FuseDequant(ir::Graph* graph, Scope* scope, } } if (dequant_type == "fake_channel_wise_dequantize_max_abs") { + if (quant_axis == 0) { + } else { + PADDLE_ENFORCE_EQ( + quant_axis == 1, true, + platform::errors::InvalidArgument( + "'quant_axis' of mul/matmul/fc op weight dequantized by " + "[fake_channel_wise_dequantize_max_abs]should be 1, but " + "the received is %d", + quant_axis)); + } PADDLE_ENFORCE_EQ( weight_scale.size(), static_cast(w_dims[1]), platform::errors::InvalidArgument( @@ -511,6 +525,16 @@ void QuantDequantFusePass::FuseDequant(ir::Graph* graph, Scope* scope, "model, please set the 'weight_quantize_type' params as " "'channel_wise_abs_max' and generate the quantized model again.", dequant_type)); + if (quant_axis == 0) { + } else { + PADDLE_ENFORCE_EQ( + quant_axis == 0, true, + platform::errors::InvalidArgument( + "'quant_axis' of conv2d/depthwise_conv2d op weight dequantized " + "by [fake_channel_wise_dequantize_max_abs]should be 0, but " + "the received is %d", + quant_axis)); + } PADDLE_ENFORCE_EQ( weight_scale.size(), static_cast(w_dims[0]), platform::errors::InvalidArgument( @@ -528,6 +552,16 @@ void QuantDequantFusePass::FuseDequant(ir::Graph* graph, Scope* scope, "conv2d_transpose must be dequantized by " "[fake_channel_wise_dequantize_max_abs], but got %s", dequant_type)); + if (quant_axis == 0) { + } else { + PADDLE_ENFORCE_EQ( + quant_axis == 1, true, + platform::errors::InvalidArgument( + "'quant_axis' of conv2d_transpose op weight dequantized by " + "[fake_channel_wise_dequantize_max_abs]should be 1, but " + "the received is %d", + quant_axis)); + } PADDLE_ENFORCE_EQ( weight_scale.size(), static_cast(w_dims[1]), platform::errors::InvalidArgument( From 4c0ad7727efd5cf9d1d1bac3364f0ae487359e5c Mon Sep 17 00:00:00 2001 From: levi131 <83750468+levi131@users.noreply.github.com> Date: Mon, 18 Oct 2021 16:10:52 +0800 Subject: [PATCH 015/116] Lml/vhp (#36146) * init functional jacobian api * finish test with dtype float32 * add float64 test case * polish code * use atol=1e-5 with dtype float64 * fix for ci * set timeout for test_jacobian 
* init hessian API * save status * polish API docstring * modify docstring * add utils.py * save status * fix dygraph double grad dtype error when calling for high differential senario * reinvoke ci * test_hessian.py is ok * polish hessian API * init vhp * Revert "init vhp" This reverts commit cbd4d3b66abe82b0ac10721b9eddeb7d82e0a1c8. * init vhp * finish vhp API logically * add test for partial_engine.cc * modify numerical_delta with dtype float32 * merge fix for dtype float64 * spell fix * save status * polish code * rm _stop_gradient_pre_process * save status * add example for vhp interface * add _compute_numerical_vjp and _compute_numerical_vhp * test is ok * vhp is ok * add testVHPFloat64 * modify for comments * modify format * modify format * save status * test_vhp is ok * finish code polish * small modify for v is None Co-authored-by: JiabinYang <360788950@qq.com> --- python/paddle/autograd/__init__.py | 2 +- python/paddle/autograd/functional.py | 112 ++++++++++- python/paddle/autograd/utils.py | 4 +- .../tests/unittests/autograd/CMakeLists.txt | 1 + .../tests/unittests/autograd/test_vhp.py | 182 ++++++++++++++++++ .../fluid/tests/unittests/autograd/utils.py | 26 +++ 6 files changed, 319 insertions(+), 8 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/autograd/test_vhp.py diff --git a/python/paddle/autograd/__init__.py b/python/paddle/autograd/__init__.py index cffc18e95e5ab..bbfb9f22fc1cb 100644 --- a/python/paddle/autograd/__init__.py +++ b/python/paddle/autograd/__init__.py @@ -18,6 +18,6 @@ from .py_layer import PyLayer, PyLayerContext # noqa: F401 from ..framework import set_grad_enabled # noqa: F401 from ..fluid.dygraph.base import no_grad_ as no_grad # noqa: F401 -from .functional import vjp, jvp, jacobian, hessian # noqa: F401 +from .functional import vjp, jvp, jacobian, hessian, vhp # noqa: F401 __all__ = ['backward', 'PyLayer', 'PyLayerContext'] diff --git a/python/paddle/autograd/functional.py b/python/paddle/autograd/functional.py index 66ae1562edb68..c6235877f5b2d 100644 --- a/python/paddle/autograd/functional.py +++ b/python/paddle/autograd/functional.py @@ -247,9 +247,9 @@ def func(x): def jacobian(func, inputs, create_graph=False, allow_unused=False): ''' .. note:: - **This API is ONLY available in imperative mode.** + **This API is ONLY available in the imperative mode.** - This API computes the Jacobian matrix of `func` with respect to `inputs`. + This function computes the Jacobian matrix of `func` with respect to `inputs`. Parameters: func (function): a Python function that takes a Tensor or a Tensor @@ -389,9 +389,9 @@ def func(x, y): def hessian(func, inputs, create_graph=False, allow_unused=False): ''' .. note:: - **This API is ONLY available in imperative mode.** + **This API is ONLY available in the imperative mode.** - This API computes the Hessian matrix of `func` with respect to `inputs`. + This function computes the Hessian matrix of `func` with respect to `inputs`. Parameters: func (function): a Python function that takes a Tensor or a Tensor @@ -509,3 +509,107 @@ def jac_func(*ins): return jacobian( jac_func, inputs, create_graph=create_graph, allow_unused=allow_unused) + + +@framework.dygraph_only +def vhp(func, inputs, v=None, create_graph=False, allow_unused=False): + ''' + .. note:: + **This API is ONLY available in the imperative mode.** + + This function computes the product between a vector ``v`` and the + Hessian matrix of `func` with respect to `inputs`. 
+ + Parameters: + func (function): a Python function that takes a Tensor or a Tensor + list/tuple as inputs and returns a Tensor with a single element. + inputs (Tensor|list(Tensor)|tuple(Tensor)): the input Tensor or + Tensor list/tuple of the function ``func``. + v (Tensor|list(Tensor)|tuple(Tensor)|None, optional): the vector used + to compute vector hessian product. ``v`` should have same shape + and dtype with ``inputs``. If ``v`` is None, it will be set as + Tensor|list(Tensor) with all elements 1. Defaults to "None". + create_graph (bool, optional): whether to create the gradient graphs + of the computing process. When it is True, higher order derivatives + are supported to compute; when it is False, the gradient graphs of + the computing process would be discarded. Defaults to ``False``. + allow_unused (bool, optional): whether to raise error or return None if + some Tensors of `inputs` are unreachable in the graph. Error would + be raised if allow_unused=False, and None would be returned as + their gradients if allow_unused=True. Default False. + Returns: + output (tuple): tuple with: + func_output (Tensor): output of ``func(inputs)`` + vhp (list(Tensor)): result of the vector hessian product + with the same shape and dtype as the inputs. + Examples 1: + .. code-block:: python + import paddle + def func(x): + return paddle.sum(paddle.matmul(x, x)) + + x = paddle.ones(shape=[2, 2], dtype='float32') + x.stop_gradient = False + vx = paddle.ones(shape=[2, 2], dtype='float32') * 2 + vhp_rslt = paddle.autograd.vhp(func, x, v=vx) + print(vhp_rslt) + # (Tensor(shape=[1], dtype=float32, place=CUDAPlace(0), stop_gradient=False, + # [8.]), + # Tensor(shape=[2, 2], dtype=float32, place=CUDAPlace(0), stop_gradient=True, + # [[8., 8.], + # [8., 8.]])) + + Examples 2: + .. code-block:: python + import paddle + def func(x): + return paddle.sum(paddle.matmul(x, x)) + + x = paddle.ones(shape=[2, 2], dtype='float32') + x.stop_gradient = False + vhp_rslt = paddle.autograd.vhp(func, x) + print(vhp_rslt) + # (Tensor(shape=[1], dtype=float32, place=CUDAPlace(0), stop_gradient=False, + # [8.]), + # Tensor(shape=[2, 2], dtype=float32, place=CUDAPlace(0), stop_gradient=True, + # [[4., 4.], + # [4., 4.]])) + + Examples 3: + .. 
code-block:: python + import paddle + def func(x, y): + return paddle.sum(paddle.matmul(x, x)) + + x = paddle.ones(shape=[2, 2], dtype='float32') + x.stop_gradient = False + y = paddle.ones(shape=[2, 2], dtype='float32') + y.stop_gradient = False + vx = paddle.ones(shape=[2, 2], dtype='float32') * 2 + vy = paddle.ones(shape=[2, 2], dtype='float32') * 3 + vhp_rslt = paddle.autograd.vhp(func, [x, y], v=[vx, vy], allow_unused=True) + print(vhp_rslt) + # (Tensor(shape=[1], dtype=float32, place=CUDAPlace(0), stop_gradient=False, + # [8.]), + # [Tensor(shape=[2, 2], dtype=float32, place=CUDAPlace(0), stop_gradient=True, + # [[8., 8.], + # [8., 8.]]), None]) + ''' + xs = _tensors(inputs, "inputs") + if v is not None: + v = _tensors(v, "v") + + with gradient_scope( + xs, v, create_graph=create_graph, + allow_unused=allow_unused) as [xs, v, grad_fn, return_fn]: + outputs = func(*xs) + ys = _tensors(outputs, "outputs") + assert len(ys) == 1 and isinstance( + ys[0], paddle.Tensor + ) and ys[0].shape == [ + 1 + ], "The function to compute vhp should return a Tensor with a single element" + jac = grad_fn(ys, xs, create_graph=True) + vhp = grad_fn(jac, xs, v) + outputs, vhp = return_fn(outputs), return_fn(vhp) + return outputs, vhp diff --git a/python/paddle/autograd/utils.py b/python/paddle/autograd/utils.py index 81fe19c1688c1..710c9ee18dfbf 100644 --- a/python/paddle/autograd/utils.py +++ b/python/paddle/autograd/utils.py @@ -25,9 +25,7 @@ def _tensors(ts, name): name) return list(ts) else: - assert isinstance( - ts, paddle.Tensor - ) or ts is None, "{} must be Tensor or list of Tensor".format(name) + assert isinstance(ts, paddle.Tensor), "{} must be Tensor".format(name) return [ts] diff --git a/python/paddle/fluid/tests/unittests/autograd/CMakeLists.txt b/python/paddle/fluid/tests/unittests/autograd/CMakeLists.txt index 369134c8989a0..30d87e2c9b2b6 100644 --- a/python/paddle/fluid/tests/unittests/autograd/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/autograd/CMakeLists.txt @@ -8,3 +8,4 @@ endforeach(TEST_OP) set_tests_properties(test_jacobian PROPERTIES TIMEOUT 20) set_tests_properties(test_hessian PROPERTIES TIMEOUT 50) +set_tests_properties(test_vhp PROPERTIES TIMEOUT 50) diff --git a/python/paddle/fluid/tests/unittests/autograd/test_vhp.py b/python/paddle/fluid/tests/unittests/autograd/test_vhp.py new file mode 100644 index 0000000000000..09b25203e04a4 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/autograd/test_vhp.py @@ -0,0 +1,182 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
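+
+# A rough reference for what the tests below verify (stated here as an
+# assumption about the intended semantics, not as an API guarantee): for a
+# scalar-valued function f,
+#
+#     paddle.autograd.vhp(f, x, v)  ->  (f(x), H(x) @ v)
+#
+# where H(x) is the Hessian of f at x. The functional API obtains the product
+# with two backward passes, roughly grad(grad(f(x), x, create_graph=True), x,
+# grad_outputs=v), while _compute_numerical_vhp contracts a finite-difference
+# Hessian with v as the numerical baseline.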
+ +import unittest +import numpy as np +import paddle +import paddle.compat as cpt +import paddle.nn.functional as F +from utils import _compute_numerical_vhp + + +class TestVHP(unittest.TestCase): + @classmethod + def setUpClass(self): + self.shape = (2, 2) + self.dtype = 'float32' + self.np_dtype = np.float32 + self.numerical_delta = 1e-2 + self.rtol = 1e-2 + self.atol = 1e-2 + self.x = paddle.rand(shape=self.shape, dtype=self.dtype) + self.y = paddle.rand(shape=self.shape, dtype=self.dtype) + self.vx = paddle.rand(shape=self.shape, dtype=self.dtype) + self.vy = paddle.rand(shape=self.shape, dtype=self.dtype) + + def test_single_input(self): + def func(x): + return paddle.sum(paddle.matmul(x, x)) + + numerical_func_output = func(self.x).numpy() + numerical_vhp = _compute_numerical_vhp( + func, self.x, self.vx, self.numerical_delta, self.np_dtype) + + self.x.stop_gradient = False + func_output, vhp = paddle.autograd.vhp(func, self.x, self.vx) + assert np.allclose(func_output.numpy(), numerical_func_output, + self.rtol, self.atol) + assert np.allclose(vhp[0].numpy(), numerical_vhp[0], self.rtol, + self.atol) + + def test_multi_input(self): + def func(x, y): + return paddle.sum(paddle.matmul(x, y)) + + numerical_func_output = func(self.x, self.y).numpy() + numerical_vhp = _compute_numerical_vhp( + func, [self.x, self.y], [self.vx, self.vy], self.numerical_delta, + self.np_dtype) + + self.x.stop_gradient = False + self.y.stop_gradient = False + func_output, vhp = paddle.autograd.vhp(func, [self.x, self.y], + [self.vx, self.vy]) + assert np.allclose(func_output.numpy(), numerical_func_output, + self.rtol, self.atol) + for i in range(len(vhp)): + assert np.allclose(vhp[i].numpy(), numerical_vhp[i], self.rtol, + self.atol) + + def test_v_default(self): + def func(x, y): + return paddle.sum(paddle.matmul(x, y)) + + numerical_func_output = func(self.x, self.y).numpy() + vx = paddle.ones(self.vx.shape, dtype=self.vx.dtype) + vy = paddle.ones(self.vy.shape, dtype=self.vy.dtype) + numerical_vhp = _compute_numerical_vhp(func, [self.x, self.y], + [vx, vy], self.numerical_delta, + self.np_dtype) + + self.x.stop_gradient = False + self.y.stop_gradient = False + func_output, vhp = paddle.autograd.vhp(func, [self.x, self.y]) + assert np.allclose(func_output.numpy(), numerical_func_output, + self.rtol, self.atol) + for i in range(len(vhp)): + assert np.allclose(vhp[i].numpy(), numerical_vhp[i], self.rtol, + self.atol) + + def test_allow_unused_false(self): + def func(x, y): + return paddle.sum(paddle.matmul(x, x)) + + try: + self.x.stop_gradient = False + self.y.stop_gradient = False + _ = paddle.autograd.vhp(func, [self.x, self.y]) + except ValueError as e: + error_msg = cpt.get_exception_message(e) + assert error_msg.find("allow_unused") > 0 + + def test_allow_unused_true(self): + def func(x, y): + return paddle.sum(paddle.matmul(x, x)) + + numerical_func_output = func(self.x, self.y).numpy() + numerical_vhp = _compute_numerical_vhp( + func, [self.x, self.y], [self.vx, self.vy], self.numerical_delta, + self.np_dtype) + + self.x.stop_gradient = False + self.y.stop_gradient = False + func_output, vhp = paddle.autograd.vhp(func, [self.x, self.y], + [self.vx, self.vy], + allow_unused=True) + assert np.allclose(func_output.numpy(), numerical_func_output, + self.rtol, self.atol) + assert np.allclose(vhp[0].numpy(), numerical_vhp[0], self.rtol, + self.atol) + assert vhp[1] is None + + def test_create_graph_false(self): + def func(x): + return paddle.sum(F.sigmoid(x)) + + numerical_func_output = 
func(self.x).numpy() + numerical_vhp = _compute_numerical_vhp( + func, self.x, self.vx, self.numerical_delta, self.np_dtype) + + self.x.stop_gradient = False + func_output, vhp = paddle.autograd.vhp(func, self.x, self.vx) + assert np.allclose(func_output.numpy(), numerical_func_output, + self.rtol, self.atol) + assert vhp[0].stop_gradient == True + assert np.allclose(vhp[0].numpy(), numerical_vhp[0], self.rtol, + self.atol) + try: + paddle.grad(vhp, self.x) + except RuntimeError as e: + error_msg = cpt.get_exception_message(e) + assert error_msg.find("has no gradient") > 0 + + def test_create_graph_true(self): + def func(x): + return paddle.sum(F.sigmoid(x)) + + numerical_func_output = func(self.x).numpy() + numerical_vhp = _compute_numerical_vhp( + func, self.x, self.vx, self.numerical_delta, self.np_dtype) + + self.x.stop_gradient = False + func_output, vhp = paddle.autograd.vhp(func, + self.x, + self.vx, + create_graph=True) + assert np.allclose(func_output.numpy(), numerical_func_output, + self.rtol, self.atol) + assert vhp[0].stop_gradient == False + assert np.allclose(vhp[0].numpy(), numerical_vhp[0], self.rtol, + self.atol) + triple_grad = paddle.grad(vhp, self.x) + assert triple_grad is not None + + +class TestVHPFloat64(TestVHP): + @classmethod + def setUpClass(self): + self.shape = (2, 2) + self.dtype = 'float64' + self.np_dtype = np.float64 + self.numerical_delta = 1e-5 + self.rtol = 1e-5 + self.atol = 1e-5 + self.x = paddle.rand(shape=self.shape, dtype=self.dtype) + self.y = paddle.rand(shape=self.shape, dtype=self.dtype) + self.vx = paddle.rand(shape=self.shape, dtype=self.dtype) + self.vy = paddle.rand(shape=self.shape, dtype=self.dtype) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/autograd/utils.py b/python/paddle/fluid/tests/unittests/autograd/utils.py index 3087e932051d8..402e89ae47661 100644 --- a/python/paddle/fluid/tests/unittests/autograd/utils.py +++ b/python/paddle/fluid/tests/unittests/autograd/utils.py @@ -105,3 +105,29 @@ def _compute_numerical_hessian(func, xs, delta, np_dtype): jacobian_pos[0][i][0][p] - jacobian_neg[0][i][0][p] ) / delta / 2. 
return hessian + + +def _compute_numerical_vjp(func, xs, v, delta, np_dtype): + xs = _tensors(xs, "xs") + jacobian = np.array(_compute_numerical_jacobian(func, xs, delta, np_dtype)) + flat_v = np.array([v_el.numpy().reshape(-1) for v_el in v]) + vjp = [np.zeros((_product(x.shape)), dtype=np_dtype) for x in xs] + for j in range(len(xs)): + for q in range(_product(xs[j].shape)): + vjp[j][q] = np.sum(jacobian[:, j, :, q].reshape(flat_v.shape) * + flat_v) + vjp = [vjp[j].reshape(xs[j].shape) for j in range(len(xs))] + return vjp + + +def _compute_numerical_vhp(func, xs, v, delta, np_dtype): + xs = _tensors(xs, "xs") + hessian = np.array(_compute_numerical_hessian(func, xs, delta, np_dtype)) + flat_v = np.array([v_el.numpy().reshape(-1) for v_el in v]) + vhp = [np.zeros((_product(x.shape)), dtype=np_dtype) for x in xs] + for j in range(len(xs)): + for q in range(_product(xs[j].shape)): + vhp[j][q] = np.sum(hessian[:, j, :, q].reshape(flat_v.shape) * + flat_v) + vhp = [vhp[j].reshape(xs[j].shape) for j in range(len(xs))] + return vhp From bdac9ff6650d30f8b4fe0334e39c0a506757ea67 Mon Sep 17 00:00:00 2001 From: jakpiase <62569058+jakpiase@users.noreply.github.com> Date: Mon, 18 Oct 2021 12:38:24 +0200 Subject: [PATCH 016/116] Added softplus FP32 FWD OneDNN kernel (#36382) * added softplus * refactored softplus op * deleted unnecessary file * added missing file * added formatting * disabled tests if GPU is used * added reviewer suggestion * unified softplus kernel --- .../operators/mkldnn/activation_mkldnn_op.cc | 13 +++ .../operators/mkldnn/softplus_mkldnn_op.h | 94 +++++++++++++++++++ .../mkldnn/test_softplus_mkldnn_op.py | 78 +++++++++++++++ 3 files changed, 185 insertions(+) create mode 100644 paddle/fluid/operators/mkldnn/softplus_mkldnn_op.h create mode 100644 python/paddle/fluid/tests/unittests/mkldnn/test_softplus_mkldnn_op.py diff --git a/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc index 603a70458b0ce..29106dc30498e 100644 --- a/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc @@ -13,6 +13,7 @@ limitations under the License. */ #include "paddle/fluid/operators/activation_op.h" +#include "paddle/fluid/operators/mkldnn/softplus_mkldnn_op.h" #include "paddle/fluid/platform/mkldnn_reuse.h" namespace paddle { @@ -169,6 +170,13 @@ struct GeluMKLDNNGradFunctor : public BaseActivationFunctor { } }; +template +struct SoftplusMKLDNNFunctor : public BaseActivationFunctor { + void operator()(const framework::ExecutionContext &ctx) const { + custom_softplus_eltwise_forward(ctx); + } +}; + template using ReluMKLDNNFunctor = MKLDNNActivationFunc; @@ -272,3 +280,8 @@ REGISTER_ACTIVATION_MKLDNN_BF16_KERNEL(gelu, GeluMKLDNNFunctor, GeluMKLDNNGradFunctor); REGISTER_ACTIVATION_MKLDNN_BF16_KERNEL(sigmoid, SigmoidMKLDNNFunctor, SigmoidMKLDNNGradFunctor); + +namespace ops = paddle::operators; +REGISTER_OP_KERNEL( + softplus, MKLDNN, paddle::platform::CPUPlace, + ops::MKLDNNActivationKernel>); diff --git a/paddle/fluid/operators/mkldnn/softplus_mkldnn_op.h b/paddle/fluid/operators/mkldnn/softplus_mkldnn_op.h new file mode 100644 index 0000000000000..fdb2c534e0363 --- /dev/null +++ b/paddle/fluid/operators/mkldnn/softplus_mkldnn_op.h @@ -0,0 +1,94 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/platform/mkldnn_reuse.h" + +namespace paddle { +namespace operators { + +using paddle::framework::Tensor; + +template +class SoftplusMKLDNNHandler + : public platform::MKLDNNHandlerNoCachingT { + public: + SoftplusMKLDNNHandler(const Tensor* x, const float beta, + const mkldnn::engine engine, platform::Place cpu_place) + : platform::MKLDNNHandlerNoCachingT(engine, cpu_place) { + auto x_tz = framework::vectorize(x->dims()); + auto x_md = + dnnl::memory::desc(x_tz, platform::MKLDNNGetDataType(), x->format()); + + auto beta_tz = std::vector(x_tz.size(), 1); + auto beta_md = dnnl::memory::desc(beta_tz, platform::MKLDNNGetDataType(), + x->format()); + + dnnl::post_ops post_ops; + post_ops.append_eltwise(1.0f, dnnl::algorithm::eltwise_soft_relu, 0.0f, + 0.0f); + if (beta != 1.0f) { + post_ops.append_eltwise(1.0f, dnnl::algorithm::eltwise_linear, + 1.0f / beta, 0.0f); + } + + dnnl::primitive_attr attrs; + attrs.set_post_ops(post_ops); + + this->AcquireForwardPrimitiveDescriptor(attrs, dnnl::algorithm::binary_mul, + x_md, beta_md, x_md); + } + + std::shared_ptr AcquireBetaMemory(const float* beta) { + return this->AcquireMemoryFromPrimitive( + this->fwd_pd_->src1_desc(), platform::to_void_cast(beta)); + } +}; + +template +void custom_softplus_eltwise_forward(const framework::ExecutionContext& ctx) { + const auto& dev_ctx = + ctx.template device_context(); + const auto& mkldnn_engine = dev_ctx.GetEngine(); + + const auto* x = ctx.Input("X"); + auto* out = ctx.Output("Out"); + + bool is_inplaced = x->IsSharedBufferWith(*out); + + const float beta = ctx.Attr("beta"); + + SoftplusMKLDNNHandler handler(x, beta, mkldnn_engine, ctx.GetPlace()); + + auto src_memory_p = handler.AcquireSrcMemory(x); + + auto beta_memory_p = handler.AcquireBetaMemory(&beta); + auto dst_memory_p = + is_inplaced ? src_memory_p : handler.AcquireDstMemory(out); + auto binary_p = handler.AcquireForwardPrimitive(); + + auto& astream = paddle::platform::MKLDNNDeviceContext::tls().get_stream(); + + const std::unordered_map args = { + {DNNL_ARG_SRC_0, *src_memory_p}, + {DNNL_ARG_SRC_1, *beta_memory_p}, + {DNNL_ARG_DST, *dst_memory_p}}; + + binary_p->execute(astream, args); + astream.wait(); + + out->set_layout(framework::DataLayout::kMKLDNN); + out->set_format(platform::GetMKLDNNFormat(*dst_memory_p)); +} +} // namespace operators +} // namespace paddle diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_softplus_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_softplus_mkldnn_op.py new file mode 100644 index 0000000000000..92699cdbd2709 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_softplus_mkldnn_op.py @@ -0,0 +1,78 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np +from paddle.fluid.tests.unittests.op_test import OpTest, OpTestTool +import paddle +import paddle.fluid as fluid +import paddle.fluid.core as core +from paddle.fluid.framework import _current_expected_place + + +def ref_softplus(x, beta, threshold): + x_beta = beta * x + out = np.select([x_beta <= threshold, x_beta > threshold], + [np.log(1 + np.exp(x_beta)) / beta, x]) + return out + + +@OpTestTool.skip_if(not (isinstance(_current_expected_place(), core.CPUPlace)), + "GPU is not supported") +class TestSoftplusOneDNNOp(OpTest): + def setUp(self): + self.op_type = "softplus" + self.beta = 1 + self.threshold = 20 + self.config() + self.attrs = {'use_mkldnn': True, 'beta': self.beta} + self.inputs = {'X': np.random.random(self.x_shape).astype(np.float32)} + self.outputs = { + 'Out': ref_softplus(self.inputs['X'], self.beta, self.threshold) + } + + def config(self): + self.x_shape = (10, 10) + + def test_check_output(self): + self.check_output() + + +class TestSoftplus4DOneDNNOp(TestSoftplusOneDNNOp): + def config(self): + self.x_shape = (10, 5, 4, 2) + + +class TestSoftplus6DOneDNNOp(TestSoftplusOneDNNOp): + def config(self): + self.x_shape = (3, 2, 2, 5, 4, 2) + + +class TestSoftplus6DExtendedFunctorOneDNNOp(TestSoftplusOneDNNOp): + def config(self): + self.x_shape = (3, 5, 2, 5, 4, 2) + self.beta = 2.5 + + +class TestSoftplus3DExtendedFunctorOneDNNOp(TestSoftplusOneDNNOp): + def config(self): + self.x_shape = (20, 4, 2) + self.beta = 0.4 + + +if __name__ == "__main__": + paddle.enable_static() + unittest.main() From 10f0a0f6c8f71436bad715b0f74329e89ea076f9 Mon Sep 17 00:00:00 2001 From: Haohongxiang <86215757+haohongxiang@users.noreply.github.com> Date: Mon, 18 Oct 2021 20:02:20 +0800 Subject: [PATCH 017/116] [HybridParallel]Support fp16 in dygraph hybrid parallel (#36420) * [HybridParallel]Support fp16 in dygraph hybrid parallel * update * update * update for recompute * add unittest of pp+fp16 * add unittest of recompute+fp16 * update * modify ut --- .../distributed/fleet/base/fleet_base.py | 40 ++++- .../fleet/meta_parallel/pipeline_parallel.py | 37 +++-- .../fleet/meta_parallel/pp_utils/utils.py | 13 +- .../distributed/fleet/utils/recompute.py | 15 +- python/paddle/fluid/framework.py | 2 +- .../unittests/hybrid_parallel_pp_fp16.py | 138 ++++++++++++++++++ .../tests/unittests/test_dygraph_recompute.py | 38 ++++- ...test_parallel_dygraph_pipeline_parallel.py | 5 +- 8 files changed, 257 insertions(+), 31 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/hybrid_parallel_pp_fp16.py diff --git a/python/paddle/distributed/fleet/base/fleet_base.py b/python/paddle/distributed/fleet/base/fleet_base.py index 544c79a0b3969..571199b99b0d9 100755 --- a/python/paddle/distributed/fleet/base/fleet_base.py +++ b/python/paddle/distributed/fleet/base/fleet_base.py @@ -35,6 +35,8 @@ from ..meta_parallel import PipelineParallel, ShardingParallel from ..meta_optimizers import HybridParallelOptimizer from paddle import _C_ops +from paddle.fluid import core +from paddle.fluid.dygraph import to_variable 
__all__ = [] @@ -1548,26 +1550,52 @@ def unscale_method(self, optimizer): if getattr(optimizer, '_param_groups', None) and isinstance( optimizer._param_groups[0], dict): param_grads = [] + param_grads_fp16 = [] + param_grads_fp32 = [] for group in optimizer._param_groups: for param in group['params']: if param._grad_ivar() is not None: param_grads.append(param._grad_ivar()) + if param._grad_ivar( + ).dtype == core.VarDesc.VarType.FP16: + param_grads_fp16.append(param._grad_ivar()) + else: + param_grads_fp32.append(param._grad_ivar()) else: param_grads = [ param._grad_ivar() for param in optimizer._parameter_list if param._grad_ivar() is not None ] - _C_ops.check_finite_and_unscale(param_grads, self._scale, - param_grads, self._found_inf) - - self._found_inf = paddle.cast(self._found_inf, dtype="int32") + param_grads_fp16 = [ + param._grad_ivar() for param in optimizer._parameter_list + if (param._grad_ivar() is not None) and (param._grad_ivar( + ).dtype == core.VarDesc.VarType.FP16) + ] + param_grads_fp32 = [ + param._grad_ivar() for param in optimizer._parameter_list + if (param._grad_ivar() is not None) and (param._grad_ivar( + ).dtype == core.VarDesc.VarType.FP32) + ] + temp_found_inf_fp16 = to_variable(np.array([0]).astype(np.bool)) + temp_found_inf_fp32 = to_variable(np.array([0]).astype(np.bool)) + if len(param_grads_fp16): + _C_ops.check_finite_and_unscale(param_grads_fp16, self._scale, + param_grads_fp16, + temp_found_inf_fp16) + if len(param_grads_fp32): + _C_ops.check_finite_and_unscale(param_grads_fp32, self._scale, + param_grads_fp32, + temp_found_inf_fp32) + self._found_inf = 1 if temp_found_inf_fp16 or temp_found_inf_fp32 else 0 # TODO(shenliang03) Since dp allreduce in the optimizer is # after the gradscaler, check_finite needs to synchronize global # information. In the future, we should use check_group to speed. 
paddle.distributed.all_reduce( - self._found_inf, op=paddle.distributed.ReduceOp.MAX, group=None) - self._found_inf = paddle.cast(self._found_inf, dtype="bool") + paddle.to_tensor( + [self._found_inf], dtype="int32"), + op=paddle.distributed.ReduceOp.MAX, + group=None) # Only tensor_parallel and pipeline_parallel need to modify scaler if self._hcg.get_parallel_mode() in (ParallelMode.TENSOR_PARALLEL, diff --git a/python/paddle/distributed/fleet/meta_parallel/pipeline_parallel.py b/python/paddle/distributed/fleet/meta_parallel/pipeline_parallel.py index 9096097397277..7c7637a90fec0 100755 --- a/python/paddle/distributed/fleet/meta_parallel/pipeline_parallel.py +++ b/python/paddle/distributed/fleet/meta_parallel/pipeline_parallel.py @@ -145,9 +145,8 @@ def forward_backward_pipeline(self, data, scaler=None): p2p.send_backward(input_tensor_grad) self._layers.allreduce_shared_weight_gradients() - - train_loss = self._broadcast_final_loss() - + with paddle.amp.auto_cast(enable=False): + train_loss = self._broadcast_final_loss() return train_loss def train_batch(self, data, optimizer, lr_scheduler=None, scaler=None): @@ -172,7 +171,8 @@ def train_batch(self, data, optimizer, lr_scheduler=None, scaler=None): train_loss = self.forward_backward_pipeline(data, scaler) # optimizer - self._optimizer_step() + with paddle.amp.auto_cast(enable=False): + self._optimizer_step() return train_loss @@ -242,12 +242,13 @@ def _forward_step(self, input_tensor): output_tensor, paddle.Tensor ), "Currently, loss_fn should obtain Paddle.Tensor dtype" - if self.accumulate_steps > 1: - output_tensor = output_tensor / self.accumulate_steps + with paddle.amp.auto_cast(enable=False): + if self.accumulate_steps > 1: + output_tensor = output_tensor / self.accumulate_steps - if self.total_loss is None: - self.total_loss = paddle.zeros_like(output_tensor) - self.total_loss += output_tensor.detach() + if self.total_loss is None: + self.total_loss = paddle.zeros_like(output_tensor) + self.total_loss += output_tensor.detach() self.micro_batch_id += 1 return output_tensor @@ -321,13 +322,29 @@ def _broadcast_final_loss(self): if self.is_last_stage: assert self.total_loss is not None, "train_batch() in last stage should obtain vaild loss" loss = self.total_loss.detach() + is_fp32 = paddle.to_tensor( + 1) if loss.dtype == paddle.float32 else paddle.to_tensor(0) + paddle.distributed.broadcast( + is_fp32, + src=self.global_rank, + use_calc_stream=True, + group=self.pp_group) paddle.distributed.broadcast( loss, src=self.global_rank, use_calc_stream=True, group=self.pp_group) else: - loss = paddle.zeros(shape=[1], dtype="float32") + is_fp32 = paddle.to_tensor(1) + paddle.distributed.broadcast( + is_fp32, + src=self._hcg.get_rank_from_stage(self.num_stages - 1), + use_calc_stream=True, + group=self.pp_group) + loss = paddle.zeros( + shape=[1], + dtype="float32") if is_fp32.numpy()[0] else paddle.zeros( + shape=[1], dtype="float16") paddle.distributed.broadcast( loss, src=self._hcg.get_rank_from_stage(self.num_stages - 1), diff --git a/python/paddle/distributed/fleet/meta_parallel/pp_utils/utils.py b/python/paddle/distributed/fleet/meta_parallel/pp_utils/utils.py index 08266096548c4..7224ba6dedda0 100644 --- a/python/paddle/distributed/fleet/meta_parallel/pp_utils/utils.py +++ b/python/paddle/distributed/fleet/meta_parallel/pp_utils/utils.py @@ -198,11 +198,14 @@ def forward(ctx, run_function, all_outputs, *args): # TODO support AMP tracer = framework._dygraph_tracer() - if tracer._amp_level == core.AmpLevel.O0: - ctx.is_fw_autocast = 
False + ctx.is_fw_autocast = False if tracer._amp_level == core.AmpLevel.O0 else True + if tracer._amp_level == core.AmpLevel.O2: + ctx.amp_level = 'O2' + elif tracer._amp_level in (core.AmpLevel.O1, core.AmpLevel.O0): + ctx.amp_level = 'O1' else: - ctx.is_fw_autocast = True - ctx.amp_mode = 'O1' + raise ValueError("unsupported amp level: {}".format( + tracer._amp_level)) ctx.amp_white_list, ctx.amp_black_list = tracer._get_amp_op_list() with paddle.no_grad(): @@ -263,7 +266,7 @@ def backward(ctx, *args): enable=ctx.is_fw_autocast, custom_white_list=ctx.amp_white_list, custom_black_list=ctx.amp_black_list, - level=ctx.amp_mode): + level=ctx.amp_level): detached_inputs = detach_variable(tuple(inputs)) outputs = ctx.run_function(*detached_inputs) diff --git a/python/paddle/distributed/fleet/utils/recompute.py b/python/paddle/distributed/fleet/utils/recompute.py index 56a64049b16e1..2d1db5db945c3 100755 --- a/python/paddle/distributed/fleet/utils/recompute.py +++ b/python/paddle/distributed/fleet/utils/recompute.py @@ -98,11 +98,14 @@ def forward(ctx, run_function, preserve_rng_state, *args): # TODO support AMP tracer = framework._dygraph_tracer() - if tracer._amp_level == core.AmpLevel.O0: - ctx.is_fw_autocast = False + ctx.is_fw_autocast = False if tracer._amp_level == core.AmpLevel.O0 else True + if tracer._amp_level == core.AmpLevel.O2: + ctx.amp_level = 'O2' + elif tracer._amp_level in (core.AmpLevel.O1, core.AmpLevel.O0): + ctx.amp_level = 'O1' else: - ctx.is_fw_autocast = True - ctx.amp_mode = 'O1' + raise ValueError("unsupported amp level: {}".format( + tracer._amp_level)) ctx.amp_white_list, ctx.amp_black_list = tracer._get_amp_op_list() with paddle.no_grad(): @@ -133,7 +136,7 @@ def backward(ctx, *args): enable=ctx.is_fw_autocast, custom_white_list=ctx.amp_white_list, custom_black_list=ctx.amp_black_list, - level=ctx.amp_mode): + level=ctx.amp_level): detached_inputs = detach_variable(tuple(inputs)) outputs = ctx.run_function(*detached_inputs) else: @@ -141,7 +144,7 @@ def backward(ctx, *args): enable=ctx.is_fw_autocast, custom_white_list=ctx.amp_white_list, custom_black_list=ctx.amp_black_list, - level=ctx.amp_mode): + level=ctx.amp_level): detached_inputs = detach_variable(tuple(inputs)) outputs = ctx.run_function(*detached_inputs) diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 156ba07a4ce08..60e00238f6cc9 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -6097,7 +6097,7 @@ def __init__(self, shape, dtype, **kwargs): self.need_clip = kwargs.get('need_clip', True) - self.is_distributed = False + self.is_distributed = kwargs.get('is_distributed', False) # self.block = default_main_program().global_block() @property diff --git a/python/paddle/fluid/tests/unittests/hybrid_parallel_pp_fp16.py b/python/paddle/fluid/tests/unittests/hybrid_parallel_pp_fp16.py new file mode 100644 index 0000000000000..571459365addf --- /dev/null +++ b/python/paddle/fluid/tests/unittests/hybrid_parallel_pp_fp16.py @@ -0,0 +1,138 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import division +from __future__ import print_function + +import unittest +import paddle +import numpy as np +import random +import paddle +import paddle.distributed as dist +import paddle.distributed.fleet as fleet +from hybrid_parallel_pp_layer import AlexNetPipeDesc, AlexNet + + +def set_random_seed(seed, dp_id, rank_id): + """Set random seed for reproducability.""" + random.seed(seed) + np.random.seed(seed + dp_id) + paddle.seed(seed + dp_id) + + +batch_size = 4 +micro_batch_size = 2 + + +class TestDistPPTraning(unittest.TestCase): + def setUp(self): + strategy = fleet.DistributedStrategy() + self.model_parallel_size = 1 + self.data_parallel_size = 1 + self.pipeline_parallel_size = 2 + strategy.hybrid_configs = { + "dp_degree": self.data_parallel_size, + "mp_degree": self.model_parallel_size, + "pp_degree": self.pipeline_parallel_size, + } + strategy.pipeline_configs = { + "accumulate_steps": batch_size // micro_batch_size, + "micro_batch_size": micro_batch_size + } + fleet.init(is_collective=True, strategy=strategy) + + def test_pp_model(self): + hcg = fleet.get_hybrid_communicate_group() + word_size = hcg.get_model_parallel_world_size() + dp_id = hcg.get_data_parallel_rank() + pp_id = hcg.get_stage_id() + rank_id = dist.get_rank() + set_random_seed(1024, dp_id, rank_id) + + #construct model a + model_a = AlexNet(10) + scheduler_a = paddle.optimizer.lr.PiecewiseDecay( + boundaries=[2], values=[0.001, 0.002], verbose=True) + optimizer_a = paddle.optimizer.SGD(learning_rate=scheduler_a, + parameters=model_a.parameters()) + + scaler_a = paddle.amp.GradScaler(init_loss_scaling=2**5) + + # construct model b + model_b = AlexNetPipeDesc(num_stages=self.pipeline_parallel_size) + scheduler_b = paddle.optimizer.lr.PiecewiseDecay( + boundaries=[2], values=[0.001, 0.002], verbose=True) + optimizer_b = paddle.optimizer.SGD(learning_rate=scheduler_b, + parameters=model_b.parameters()) + + param_len = len(model_a.parameters()) + parameters = [] + for param in model_a.parameters(): + parameters.append(param.numpy()) + + for idx, param in enumerate(model_b.parameters()): + param.set_value(parameters[idx + pp_id * (param_len // 2)]) + + model_a, optimizer_a = paddle.amp.decorate( + models=model_a, + optimizers=optimizer_a, + level='O2', + save_dtype='float32') + model_b, optimizer_b = paddle.amp.decorate( + models=model_b, + optimizers=optimizer_b, + level='O2', + save_dtype='float32') + + model_b = fleet.distributed_model(model_b) + optimizer_b = fleet.distributed_optimizer(optimizer_b) + scaler_b = paddle.amp.GradScaler(init_loss_scaling=2**5) + scaler_b = fleet.distributed_scaler(scaler_b) + + # construct reader + train_reader = paddle.batch( + paddle.dataset.mnist.train(), batch_size=batch_size, drop_last=True) + + for step_id, data in enumerate(train_reader()): + x_data = np.array([x[0] for x in data]).astype('float32').reshape( + batch_size, 1, 28, 28) + y_data = np.array([x[1] for x in data]).astype('int64').reshape( + batch_size, 1) + img = paddle.to_tensor(x_data) + label = paddle.to_tensor(y_data) + img.stop_gradient = True + label.stop_gradient = True + + if 
step_id >= 5: + return True + + with paddle.amp.auto_cast(enable=True, level='O2'): + loss_a = model_a(img, label) + scaler_a.scale(loss_a).backward() + with paddle.amp.auto_cast(enable=False): + scaler_a.minimize(optimizer_a, loss_a) + optimizer_a.clear_grad() + scheduler_a.step() + + loss_b = model_b.train_batch( + [img, label], optimizer_b, scheduler_b, scaler=scaler_b) + + print("loss: ", loss_a.numpy(), loss_b.numpy()) + np.testing.assert_allclose( + loss_a.numpy(), loss_b.numpy(), rtol=5e-3) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_dygraph_recompute.py b/python/paddle/fluid/tests/unittests/test_dygraph_recompute.py index 332603b812955..4a4bcd2b8163c 100755 --- a/python/paddle/fluid/tests/unittests/test_dygraph_recompute.py +++ b/python/paddle/fluid/tests/unittests/test_dygraph_recompute.py @@ -92,7 +92,10 @@ def forward(self, inputs): return inputs -def run_model(recompute_block=[], recompute_kwargs={}, enable_autocast=False): +def run_model(recompute_block=[], + recompute_kwargs={}, + enable_autocast=False, + pure_fp16=False): gen = paddle.seed(10) gen.manual_seed(10) np.random.seed(10) @@ -118,7 +121,8 @@ def run_model(recompute_block=[], recompute_kwargs={}, enable_autocast=False): x_data = np.random.randn(batch_size, input_size).astype(np.float32) x = paddle.to_tensor(x_data) # x.stop_gradient = False - with paddle.amp.auto_cast(True): + level = 'O2' if pure_fp16 else 'O1' + with paddle.amp.auto_cast(True, level=level): y_pred = model(x) loss = y_pred.mean() if enable_autocast: @@ -196,6 +200,36 @@ def check_identical(loss_ref, param_ref, grad_ref, loss, param, grad): recompute_block=[1, 3], enable_autocast=True) check_identical(loss_ref, param_ref, grad_ref, loss, param, grad) + def test_fc_net_with_fp16(self): + def check_identical(loss_ref, param_ref, grad_ref, loss, param, grad): + self.assertEqual(loss_ref, loss) + self.assertEqual(param_ref, param) + self.assertEqual(grad_ref, grad) + + # without recompute + loss_ref, param_ref, grad_ref = run_model( + recompute_block=[], enable_autocast=True, pure_fp16=True) + + # recompute second block + loss, param, grad = run_model( + recompute_block=[1], enable_autocast=True, pure_fp16=True) + check_identical(loss_ref, param_ref, grad_ref, loss, param, grad) + + # recompute fourth block + loss, param, grad = run_model( + recompute_block=[3], enable_autocast=True, pure_fp16=True) + check_identical(loss_ref, param_ref, grad_ref, loss, param, grad) + + # recompute second to fourth block + loss, param, grad = run_model( + recompute_block=[1, 2, 3], enable_autocast=True, pure_fp16=True) + check_identical(loss_ref, param_ref, grad_ref, loss, param, grad) + + # recompute second & fourth block + loss, param, grad = run_model( + recompute_block=[1, 3], enable_autocast=True, pure_fp16=True) + check_identical(loss_ref, param_ref, grad_ref, loss, param, grad) + def test_recompute_kwargs(self): paddle.set_device("gpu") kwargs = {"is_test": False} diff --git a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_pipeline_parallel.py b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_pipeline_parallel.py index f54aa1bb6e556..71c254dabb9e1 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_pipeline_parallel.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_pipeline_parallel.py @@ -30,9 +30,12 @@ def test_hybrid_parallel_pp_tuple_inputs(self): def test_hybrid_parallel_shared_weight(self): 
self.run_mnist_2gpu('hybrid_parallel_shared_weight.py') - def test_pipeline_parallel(self): + def test_pipeline_parallel_amp(self): self.run_mnist_2gpu('hybrid_parallel_pp_amp.py') + def test_pipeline_parallel_fp16(self): + self.run_mnist_2gpu('hybrid_parallel_pp_fp16.py') + def test_hybrid_parallel_transformer(self): self.run_mnist_2gpu('hybrid_parallel_pp_transformer.py') From 305b99a0c1be76ed33490231d41cba2057b57eaa Mon Sep 17 00:00:00 2001 From: Zeng Jinle <32832641+sneaxiy@users.noreply.github.com> Date: Tue, 19 Oct 2021 10:30:42 +0800 Subject: [PATCH 018/116] Add pow2_decay_with_linear_warmup op (#36421) * add pow2_warmup op * remove contrib __all__ * add AttrT * rename * follow comments * fix duplicate PADDLE_RESTRICT --- .../pow2_decay_with_linear_warmup_op.cc | 90 +++++++++++++ .../pow2_decay_with_linear_warmup_op.cu | 24 ++++ .../pow2_decay_with_linear_warmup_op.h | 119 ++++++++++++++++++ python/paddle/fluid/contrib/layers/nn.py | 36 ++++++ .../test_pow2_decay_with_linear_warmup_op.py | 90 +++++++++++++ 5 files changed, 359 insertions(+) create mode 100644 paddle/fluid/operators/optimizers/pow2_decay_with_linear_warmup_op.cc create mode 100644 paddle/fluid/operators/optimizers/pow2_decay_with_linear_warmup_op.cu create mode 100644 paddle/fluid/operators/optimizers/pow2_decay_with_linear_warmup_op.h create mode 100644 python/paddle/fluid/tests/unittests/test_pow2_decay_with_linear_warmup_op.py diff --git a/paddle/fluid/operators/optimizers/pow2_decay_with_linear_warmup_op.cc b/paddle/fluid/operators/optimizers/pow2_decay_with_linear_warmup_op.cc new file mode 100644 index 0000000000000..12362b1bc6401 --- /dev/null +++ b/paddle/fluid/operators/optimizers/pow2_decay_with_linear_warmup_op.cc @@ -0,0 +1,90 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/optimizers/pow2_decay_with_linear_warmup_op.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/platform/float16.h" + +namespace paddle { +namespace operators { + +class Pow2DecayWithLinearWarmupOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext *ctx) const override { + auto dim = framework::make_ddim({1}); + ctx->SetOutputDim("LearningRateOut", dim); + ctx->SetOutputDim("StepOut", dim); + } + + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + auto data_type = + OperatorWithKernel::IndicateVarDataType(ctx, "LearningRate"); + return framework::OpKernelType(data_type, ctx.device_context()); + } +}; + +class Pow2DecayWithLinearWarmupOpMaker + : public framework::OpProtoAndCheckerMaker { + public: + void Make() { + AddInput("LearningRate", "(Tensor) The input learning rate Tensor."); + AddInput("Step", "(Tensor) The input global step Tensor."); + AddOutput("LearningRateOut", + "(Tensor) The output learning rate Tensor. 
Same with " + "Input(LearningRate)."); + AddOutput( + "StepOut", + "(Tensor) The output learning rate Tensor. Same with Input(Step)."); + AddAttr("warmup_steps", "(int64_t) The warmup steps."); + AddAttr( + "total_steps", + "(int64_t) The total steps for changing the learning rate."); + AddAttr("start_lr", + "(float) The initial value of the learning rate."); + AddAttr("base_lr", + "(float) The final learning rate value after warmup."); + AddAttr("end_lr", + "(float) The final learning rate value after total_steps."); + AddComment(R"DOC( +The Pow2DecayWithLinearWarmup learning rate scheduler. + +When step_num < warmup_steps, lr = (base_lr - start_lr) * step_num / warmup_steps + start_lr + +When warmup_steps <= step_num <= total_steps, + factor = 1 - (step_num - warmup_steps) / (total_steps - warmup_steps) + lr = (base_lr - end_lr) * factor * factor + end_lr + +When step_num > total_steps, lr = end_lr + +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_WITHOUT_GRADIENT(pow2_decay_with_linear_warmup, + ops::Pow2DecayWithLinearWarmupOp, + ops::Pow2DecayWithLinearWarmupOpMaker); +REGISTER_OP_CPU_KERNEL( + pow2_decay_with_linear_warmup, + ops::Pow2DecayWithLinearWarmupOpKernel, + ops::Pow2DecayWithLinearWarmupOpKernel); diff --git a/paddle/fluid/operators/optimizers/pow2_decay_with_linear_warmup_op.cu b/paddle/fluid/operators/optimizers/pow2_decay_with_linear_warmup_op.cu new file mode 100644 index 0000000000000..6695778dbac06 --- /dev/null +++ b/paddle/fluid/operators/optimizers/pow2_decay_with_linear_warmup_op.cu @@ -0,0 +1,24 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/optimizers/pow2_decay_with_linear_warmup_op.h" + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_CUDA_KERNEL( + pow2_decay_with_linear_warmup, + ops::Pow2DecayWithLinearWarmupOpKernel, + ops::Pow2DecayWithLinearWarmupOpKernel); diff --git a/paddle/fluid/operators/optimizers/pow2_decay_with_linear_warmup_op.h b/paddle/fluid/operators/optimizers/pow2_decay_with_linear_warmup_op.h new file mode 100644 index 0000000000000..41e07b0343e72 --- /dev/null +++ b/paddle/fluid/operators/optimizers/pow2_decay_with_linear_warmup_op.h @@ -0,0 +1,119 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/platform/for_range.h" +#include "paddle/fluid/platform/macros.h" + +namespace paddle { +namespace operators { + +template +struct Pow2DecayWithLinearWarmupFunctor { + template + using RestrictPtr = U *PADDLE_RESTRICT; + + public: + HOSTDEVICE Pow2DecayWithLinearWarmupFunctor( + RestrictPtr lr, RestrictPtr step, size_t warmup_steps, + size_t total_steps, AttrT start_lr, AttrT base_lr, AttrT end_lr) + : lr_(lr), + step_(step), + warmup_steps_(warmup_steps), + total_steps_(total_steps), + start_lr_(start_lr), + base_lr_(base_lr), + end_lr_(end_lr) {} + + HOSTDEVICE void operator()(size_t) const { + size_t step = static_cast(*step_); + *step_ = static_cast(step + 1); + if (step < warmup_steps_) { + auto new_lr = + static_cast(base_lr_ - start_lr_) * step / warmup_steps_ + + start_lr_; + *lr_ = static_cast(new_lr); + } else if (step < total_steps_) { + auto factor = 1 - + static_cast(step - warmup_steps_) / + (total_steps_ - warmup_steps_); + auto new_lr = + static_cast(base_lr_ - end_lr_) * factor * factor + end_lr_; + *lr_ = static_cast(new_lr); + } else { + *lr_ = static_cast(end_lr_); + } + } + + private: + RestrictPtr lr_; + RestrictPtr step_; + size_t warmup_steps_; + size_t total_steps_; + AttrT start_lr_; + AttrT base_lr_; + AttrT end_lr_; +}; + +template +class Pow2DecayWithLinearWarmupOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const { + const auto *lr = ctx.Input("LearningRate"); + const auto *step = ctx.Input("Step"); + auto *lr_out = ctx.Output("LearningRateOut"); + auto *step_out = ctx.Output("StepOut"); + PADDLE_ENFORCE_EQ( + lr, lr_out, platform::errors::InvalidArgument("Input(LearningRate) and " + "Output(LearningRateOut) " + "must be the same.")); + PADDLE_ENFORCE_NOT_NULL(lr, + platform::errors::InvalidArgument( + "Input(LearingRate) should not be nullptr.")); + PADDLE_ENFORCE_EQ(step, step_out, + platform::errors::InvalidArgument( + "Input(Step) and Output(StepOut) must be the same.")); + PADDLE_ENFORCE_NOT_NULL(step, platform::errors::InvalidArgument( + "Input(Step) should not be nullptr.")); + PADDLE_ENFORCE_EQ( + step->IsInitialized(), true, + platform::errors::InvalidArgument("Input(Step) must be initialized.")); + + auto warmup_steps = static_cast(ctx.Attr("warmup_steps")); + auto total_steps = static_cast(ctx.Attr("total_steps")); + PADDLE_ENFORCE_LE(warmup_steps, total_steps, + platform::errors::InvalidArgument( + "warmup_steps must not be larger than total_steps.")); + auto start_lr = ctx.Attr("start_lr"); + auto base_lr = ctx.Attr("base_lr"); + auto end_lr = ctx.Attr("end_lr"); + + auto *lr_data = lr_out->data(); + auto *step_data = step_out->data(); + auto &dev_ctx = ctx.template device_context(); + platform::ForRange for_range(dev_ctx, 1); + using AttrT = float; + Pow2DecayWithLinearWarmupFunctor functor( + lr_data, step_data, warmup_steps, total_steps, + static_cast(start_lr), static_cast(base_lr), + static_cast(end_lr)); + for_range(functor); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/python/paddle/fluid/contrib/layers/nn.py b/python/paddle/fluid/contrib/layers/nn.py index 99ede353c1081..0d0addb17e9ae 100644 --- a/python/paddle/fluid/contrib/layers/nn.py +++ b/python/paddle/fluid/contrib/layers/nn.py @@ -1932,3 +1932,39 @@ def 
build_program(main_program, startup_program): attrs=attrs) return batch_norm_out + + +def pow2_decay_with_linear_warmup(warmup_steps, + total_steps, + start_lr, + base_lr, + end_lr, + dtype='float32', + name=None): + if paddle.fluid.in_dygraph_mode(): + raise NotImplementedError( + "pow2_warmup does not support dygraph mode yet.") + + helper = LayerHelper("pow2_decay_with_linear_warmup", **locals()) + lr = helper.create_global_variable(persistable=True, dtype=dtype, shape=[1]) + helper.set_variable_initializer(lr, Constant(value=start_lr)) + + step = helper.create_global_variable( + persistable=True, dtype='int64', shape=[1]) + helper.set_variable_initializer(step, Constant(value=0)) + assert warmup_steps <= total_steps, "warmup_steps cannot be larger than total_steps" + + helper.append_op( + type="pow2_decay_with_linear_warmup", + inputs={"LearningRate": lr, + "Step": step}, + outputs={"LearningRateOut": lr, + "StepOut": step}, + attrs={ + "warmup_steps": warmup_steps, + "total_steps": total_steps, + "start_lr": start_lr, + "base_lr": base_lr, + "end_lr": end_lr, + }) + return lr diff --git a/python/paddle/fluid/tests/unittests/test_pow2_decay_with_linear_warmup_op.py b/python/paddle/fluid/tests/unittests/test_pow2_decay_with_linear_warmup_op.py new file mode 100644 index 0000000000000..641ea3eccf8d2 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_pow2_decay_with_linear_warmup_op.py @@ -0,0 +1,90 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
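+# A minimal pure-Python sketch of the schedule this op implements: linear
+# warmup from start_lr to base_lr, then quadratic (pow2) decay down to end_lr,
+# matching the formula in the op's DOC comment. The helper name here is
+# illustrative only.
+def ref_pow2_warmup_lr(step, warmup_steps, total_steps, start_lr, base_lr,
+                       end_lr):
+    if step < warmup_steps:
+        return (base_lr - start_lr) * step / warmup_steps + start_lr
+    if step < total_steps:
+        factor = 1.0 - float(step - warmup_steps) / (total_steps - warmup_steps)
+        return (base_lr - end_lr) * factor * factor + end_lr
+    return end_lr
+
+# For example, with warmup_steps=30, total_steps=100, start_lr=0.01,
+# base_lr=0.02 and end_lr=0.001 this gives lr=0.01 at step 0, 0.02 at step 30
+# and 0.001 from step 100 onwards.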
+ +import paddle +from paddle.fluid.contrib.layers.nn import pow2_decay_with_linear_warmup +from paddle.optimizer.lr import LinearWarmup +from paddle.optimizer.lr import PolynomialDecay +import unittest + + +def gen_pow2_warmup_op_lr(warmup_steps, total_steps, start_lr, base_lr, end_lr, + place): + main = paddle.static.Program() + startup = paddle.static.Program() + with paddle.static.program_guard(main, startup): + lr = pow2_decay_with_linear_warmup(warmup_steps, total_steps, start_lr, + base_lr, end_lr) + exe = paddle.static.Executor(place) + with paddle.static.scope_guard(paddle.static.Scope()): + exe.run(startup) + while True: + lr_np = exe.run(main, fetch_list=[lr])[0] + yield lr_np[0] + + +class Pow2Warmup(LinearWarmup): + def __init__(self, warmup_steps, total_steps, start_lr, base_lr, end_lr): + assert total_steps > warmup_steps + lr_sch = PolynomialDecay( + learning_rate=base_lr, + decay_steps=total_steps - warmup_steps, + end_lr=end_lr, + power=2) + + super(Pow2Warmup, self).__init__( + learning_rate=lr_sch, + warmup_steps=warmup_steps, + start_lr=start_lr, + end_lr=base_lr) + + +def gen_pow2_warmup_py_lr(warmup_steps, total_steps, start_lr, base_lr, end_lr, + place): + lr_sch = Pow2Warmup(warmup_steps, total_steps, start_lr, base_lr, end_lr) + while True: + yield lr_sch() + lr_sch.step() + + +class TestPow2WarmupLRScheduler(unittest.TestCase): + def setUp(self): + paddle.enable_static() + self.params = { + 'warmup_steps': 30, + 'total_steps': 100, + 'start_lr': 0.01, + 'base_lr': 0.02, + 'end_lr': 0.001, + } + self.step_num = 1000 + + def check_with_place(self, place): + kwargs = dict(self.params) + kwargs['place'] = place + lr_sch_op = gen_pow2_warmup_op_lr(**kwargs) + lr_sch_py = gen_pow2_warmup_py_lr(**kwargs) + for i, (lr_op, lr_py) in enumerate(zip(lr_sch_op, lr_sch_py)): + self.assertLess(abs(lr_op - lr_py), 1e-6) + if i > self.step_num: + break + + def test_main(self): + self.check_with_place(paddle.CPUPlace()) + if paddle.is_compiled_with_cuda(): + self.check_with_place(paddle.CUDAPlace(0)) + + +if __name__ == "__main__": + unittest.main() From a7830a293224c21742c892aadefe9971e498952e Mon Sep 17 00:00:00 2001 From: zmx Date: Tue, 19 Oct 2021 10:37:42 +0800 Subject: [PATCH 019/116] bug fix for DeserializeSelectedRows. 
test=develop (#36520) --- paddle/fluid/distributed/service/brpc_utils.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/distributed/service/brpc_utils.cc b/paddle/fluid/distributed/service/brpc_utils.cc index a356b77e73733..376e820cb7a74 100644 --- a/paddle/fluid/distributed/service/brpc_utils.cc +++ b/paddle/fluid/distributed/service/brpc_utils.cc @@ -273,8 +273,8 @@ void DeserializeSelectedRows(framework::Variable* var, const VarMsg& msg, auto* slr = var->GetMutable(); framework::Tensor* tensor = slr->mutable_value(); slr->set_height(msg.slr_height()); - std::vector tmp_rows(msg.slr_height()); - memcpy(&tmp_rows[0], msg.data().data(), msg.slr_height() * sizeof(int64_t)); + std::vector tmp_rows(msg.dims()[0]); + memcpy(tmp_rows.data(), msg.data().data(), msg.dims()[0] * sizeof(int64_t)); slr->set_rows(tmp_rows); std::vector vec_dim; for (auto& x : msg.dims()) { From 77f4597f81b075e01d98bcde0a25d03e5a390366 Mon Sep 17 00:00:00 2001 From: xiaoting <31891223+tink2123@users.noreply.github.com> Date: Tue, 19 Oct 2021 10:56:15 +0800 Subject: [PATCH 020/116] fix out of range for area interp, test=develop (#36466) --- python/paddle/nn/functional/common.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/paddle/nn/functional/common.py b/python/paddle/nn/functional/common.py index fdd370d7f81e7..7362b284eaefe 100644 --- a/python/paddle/nn/functional/common.py +++ b/python/paddle/nn/functional/common.py @@ -296,7 +296,8 @@ def interpolate(x, ) if resample == 'AREA': - if isinstance(size, list) or isinstance(size, tuple): + if isinstance(size, list) or isinstance(size, tuple) or isinstance( + size, Variable): if len(size) == 0: raise ValueError("output size can not be empty") if len(x.shape) == 3: From 1d5746bd022c1c7bc3e35eb727559f30baaf3b0f Mon Sep 17 00:00:00 2001 From: Xiaoxu Chen Date: Tue, 19 Oct 2021 13:13:16 +0800 Subject: [PATCH 021/116] add rocm support for fft api (#36415) --- paddle/fluid/operators/CMakeLists.txt | 3 +- paddle/fluid/operators/spectral_helper.h | 261 ++++++++ paddle/fluid/operators/spectral_op.cu | 614 +++++++----------- paddle/fluid/platform/dynload/CMakeLists.txt | 2 +- .../fluid/platform/dynload/dynamic_loader.cc | 10 + .../fluid/platform/dynload/dynamic_loader.h | 1 + paddle/fluid/platform/dynload/hipfft.cc | 30 + paddle/fluid/platform/dynload/hipfft.h | 124 ++++ paddle/fluid/platform/enforce.h | 10 + paddle/fluid/platform/enforce_test.cc | 4 + 10 files changed, 679 insertions(+), 380 deletions(-) create mode 100644 paddle/fluid/operators/spectral_helper.h create mode 100644 paddle/fluid/platform/dynload/hipfft.cc create mode 100644 paddle/fluid/platform/dynload/hipfft.h diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index bb31fcf854d88..78cbc7e8a583b 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -102,8 +102,7 @@ else() op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale) endif() - -if (WITH_GPU AND (NOT WITH_ROCM)) +if (WITH_GPU OR WITH_ROCM) if (MKL_FOUND AND WITH_ONEMKL) op_library(spectral_op SRCS spectral_op.cc spectral_op.cu DEPS dynload_cuda dynload_mklrt ${OP_HEADER_DEPS}) target_include_directories(spectral_op PRIVATE ${MKL_INCLUDE}) diff --git a/paddle/fluid/operators/spectral_helper.h b/paddle/fluid/operators/spectral_helper.h new file mode 100644 index 0000000000000..9c34d500eac92 --- /dev/null +++ b/paddle/fluid/operators/spectral_helper.h @@ -0,0 +1,261 @@ +// Copyright (c) 
2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/operators/spectral_op.h" + +#ifdef PADDLE_WITH_HIP +#include "paddle/fluid/platform/dynload/hipfft.h" +#endif + +#ifdef PADDLE_WITH_CUDA +#include "paddle/fluid/platform/dynload/cufft.h" +#endif + +namespace paddle { +namespace operators { +using ScalarType = framework::proto::VarType::Type; +const int64_t kMaxCUFFTNdim = 3; +const int64_t kMaxDataNdim = kMaxCUFFTNdim + 1; +// This struct is used to easily compute hashes of the +// parameters. It will be the **key** to the plan cache. +struct PlanKey { + // between 1 and kMaxCUFFTNdim, i.e., 1 <= signal_ndim <= 3 + int64_t signal_ndim_; + // These include additional batch dimension as well. + int64_t sizes_[kMaxDataNdim]; + int64_t input_shape_[kMaxDataNdim]; + int64_t output_shape_[kMaxDataNdim]; + FFTTransformType fft_type_; + ScalarType value_type_; + + PlanKey() = default; + + PlanKey(const std::vector& in_shape, + const std::vector& out_shape, + const std::vector& signal_size, FFTTransformType fft_type, + ScalarType value_type) { + // Padding bits must be zeroed for hashing + memset(this, 0, sizeof(*this)); + signal_ndim_ = signal_size.size() - 1; + fft_type_ = fft_type; + value_type_ = value_type; + + std::copy(signal_size.cbegin(), signal_size.cend(), sizes_); + std::copy(in_shape.cbegin(), in_shape.cend(), input_shape_); + std::copy(out_shape.cbegin(), out_shape.cend(), output_shape_); + } +}; + +#if defined(PADDLE_WITH_CUDA) +// An RAII encapsulation of cuFFTHandle +class CuFFTHandle { + ::cufftHandle handle_; + + public: + CuFFTHandle() { + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cufftCreate(&handle_)); + } + + ::cufftHandle& get() { return handle_; } + const ::cufftHandle& get() const { return handle_; } + + ~CuFFTHandle() { + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cufftDestroy(handle_)); + } +}; + +using plan_size_type = long long int; // NOLINT +// This class contains all the information needed to execute a cuFFT plan: +// 1. the plan +// 2. the workspace size needed +class CuFFTConfig { + public: + // Only move semantics is enought for this class. Although we already use + // unique_ptr for the plan, still remove copy constructor and assignment op so + // we don't accidentally copy and take perf hit. 
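+  // A sketch of how spectral_op.cu later in this patch drives the class (the
+  // names below mirror that file and are assumptions here, not part of this
+  // header):
+  //   CuFFTConfig config = create_cufft_config(input, output, signal_ndim);
+  //   cufftSetStream(config.plan(), ctx.stream());
+  //   workspace_tensor.mutable_data(place, config.workspace_size());
+  //   cufftSetWorkArea(config.plan(), workspace_tensor.data());
+  //   exec_cufft_plan(ctx, config, &input, &output, forward);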
+ explicit CuFFTConfig(const PlanKey& plan_key) + : CuFFTConfig( + std::vector(plan_key.sizes_, + plan_key.sizes_ + plan_key.signal_ndim_ + 1), + plan_key.signal_ndim_, plan_key.fft_type_, plan_key.value_type_) {} + + // sizes are full signal, including batch size and always two-sided + CuFFTConfig(const std::vector& sizes, const int64_t signal_ndim, + FFTTransformType fft_type, ScalarType dtype) + : fft_type_(fft_type), value_type_(dtype) { + // signal sizes (excluding batch dim) + std::vector signal_sizes(sizes.begin() + 1, sizes.end()); + + // input batch size + const auto batch = static_cast(sizes[0]); + // const int64_t signal_ndim = sizes.size() - 1; + PADDLE_ENFORCE_EQ(signal_ndim, sizes.size() - 1, + platform::errors::InvalidArgument( + "The signal_ndim must be equal to sizes.size() - 1," + "But signal_ndim is: [%d], sizes.size() - 1 is: [%d]", + signal_ndim, sizes.size() - 1)); + + cudaDataType itype, otype, exec_type; + const auto complex_input = has_complex_input(fft_type); + const auto complex_output = has_complex_output(fft_type); + if (dtype == framework::proto::VarType::FP32) { + itype = complex_input ? CUDA_C_32F : CUDA_R_32F; + otype = complex_output ? CUDA_C_32F : CUDA_R_32F; + exec_type = CUDA_C_32F; + } else if (dtype == framework::proto::VarType::FP64) { + itype = complex_input ? CUDA_C_64F : CUDA_R_64F; + otype = complex_output ? CUDA_C_64F : CUDA_R_64F; + exec_type = CUDA_C_64F; + } else if (dtype == framework::proto::VarType::FP16) { + itype = complex_input ? CUDA_C_16F : CUDA_R_16F; + otype = complex_output ? CUDA_C_16F : CUDA_R_16F; + exec_type = CUDA_C_16F; + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "cuFFT only support transforms of type float16, float32 and " + "float64")); + } + + // disable auto allocation of workspace to use allocator from the framework + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cufftSetAutoAllocation( + plan(), /* autoAllocate */ 0)); + + size_t ws_size_t; + + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cufftXtMakePlanMany( + plan(), signal_ndim, signal_sizes.data(), + /* inembed */ nullptr, /* base_istride */ 1, /* idist */ 1, itype, + /* onembed */ nullptr, /* base_ostride */ 1, /* odist */ 1, otype, + batch, &ws_size_t, exec_type)); + + ws_size = ws_size_t; + } + + const cufftHandle& plan() const { return plan_ptr.get(); } + + FFTTransformType transform_type() const { return fft_type_; } + ScalarType data_type() const { return value_type_; } + size_t workspace_size() const { return ws_size; } + + private: + CuFFTHandle plan_ptr; + size_t ws_size; + FFTTransformType fft_type_; + ScalarType value_type_; +}; + +#elif defined(PADDLE_WITH_HIP) +// An RAII encapsulation of cuFFTHandle +class HIPFFTHandle { + ::hipfftHandle handle_; + + public: + HIPFFTHandle() { + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::hipfftCreate(&handle_)); + } + + ::hipfftHandle& get() { return handle_; } + const ::hipfftHandle& get() const { return handle_; } + + ~HIPFFTHandle() { + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::hipfftDestroy(handle_)); + } +}; +using plan_size_type = int; +// This class contains all the information needed to execute a cuFFT plan: +// 1. the plan +// 2. the workspace size needed +class HIPFFTConfig { + public: + // Only move semantics is enought for this class. Although we already use + // unique_ptr for the plan, still remove copy constructor and assignment op so + // we don't accidentally copy and take perf hit. 
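+  // The HIP path in spectral_op.cu follows the same call sequence via
+  // create_hipfft_config, hipfftSetStream, hipfftSetWorkArea and
+  // exec_hipfft_plan.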
+ explicit HIPFFTConfig(const PlanKey& plan_key) + : HIPFFTConfig( + std::vector(plan_key.sizes_, + plan_key.sizes_ + plan_key.signal_ndim_ + 1), + plan_key.signal_ndim_, plan_key.fft_type_, plan_key.value_type_) {} + + // sizes are full signal, including batch size and always two-sided + HIPFFTConfig(const std::vector& sizes, const int64_t signal_ndim, + FFTTransformType fft_type, ScalarType dtype) + : fft_type_(fft_type), value_type_(dtype) { + // signal sizes (excluding batch dim) + std::vector signal_sizes(sizes.begin() + 1, sizes.end()); + + // input batch size + const auto batch = static_cast(sizes[0]); + // const int64_t signal_ndim = sizes.size() - 1; + PADDLE_ENFORCE_EQ(signal_ndim, sizes.size() - 1, + platform::errors::InvalidArgument( + "The signal_ndim must be equal to sizes.size() - 1," + "But signal_ndim is: [%d], sizes.size() - 1 is: [%d]", + signal_ndim, sizes.size() - 1)); + + hipfftType exec_type = [&] { + if (dtype == framework::proto::VarType::FP32) { + switch (fft_type) { + case FFTTransformType::C2C: + return HIPFFT_C2C; + case FFTTransformType::R2C: + return HIPFFT_R2C; + case FFTTransformType::C2R: + return HIPFFT_C2R; + } + } else if (dtype == framework::proto::VarType::FP64) { + switch (fft_type) { + case FFTTransformType::C2C: + return HIPFFT_Z2Z; + case FFTTransformType::R2C: + return HIPFFT_D2Z; + case FFTTransformType::C2R: + return HIPFFT_Z2D; + } + } + PADDLE_THROW(platform::errors::InvalidArgument( + "hipFFT only support transforms of type float32 and float64")); + }(); + + // disable auto allocation of workspace to use allocator from the framework + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::hipfftSetAutoAllocation( + plan(), /* autoAllocate */ 0)); + + size_t ws_size_t; + + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::hipfftMakePlanMany( + plan(), signal_ndim, signal_sizes.data(), + /* inembed */ nullptr, /* base_istride */ 1, /* idist */ 1, + /* onembed */ nullptr, /* base_ostride */ 1, /* odist */ 1, exec_type, + batch, &ws_size_t)); + + ws_size = ws_size_t; + } + + const hipfftHandle& plan() const { return plan_ptr.get(); } + + FFTTransformType transform_type() const { return fft_type_; } + ScalarType data_type() const { return value_type_; } + size_t workspace_size() const { return ws_size; } + + private: + HIPFFTHandle plan_ptr; + size_t ws_size; + FFTTransformType fft_type_; + ScalarType value_type_; +}; +#endif +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/spectral_op.cu b/paddle/fluid/operators/spectral_op.cu index 24dffaad41b5f..e8a4fac2915d7 100644 --- a/paddle/fluid/operators/spectral_op.cu +++ b/paddle/fluid/operators/spectral_op.cu @@ -8,10 +8,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ - -#include -#include - #include #include #include @@ -24,311 +20,246 @@ #include #include "paddle/fluid/operators/conj_op.h" +#include "paddle/fluid/operators/spectral_helper.h" #include "paddle/fluid/operators/spectral_op.h" #include "paddle/fluid/operators/transpose_op.h" -#include "paddle/fluid/platform/dynload/cufft.h" +#include "paddle/fluid/platform/enforce.h" namespace paddle { namespace operators { namespace { -using ScalarType = framework::proto::VarType::Type; -const int64_t kMaxCUFFTNdim = 3; -const int64_t kMaxDataNdim = kMaxCUFFTNdim + 1; - -static inline std::string get_cufft_error_info(cufftResult error) { - switch (error) { - case CUFFT_SUCCESS: - return "CUFFT_SUCCESS"; - case CUFFT_INVALID_PLAN: - return "CUFFT_INVALID_PLAN"; - case CUFFT_ALLOC_FAILED: - return "CUFFT_ALLOC_FAILED"; - case CUFFT_INVALID_TYPE: - return "CUFFT_INVALID_TYPE"; - case CUFFT_INVALID_VALUE: - return "CUFFT_INVALID_VALUE"; - case CUFFT_INTERNAL_ERROR: - return "CUFFT_INTERNAL_ERROR"; - case CUFFT_EXEC_FAILED: - return "CUFFT_EXEC_FAILED"; - case CUFFT_SETUP_FAILED: - return "CUFFT_SETUP_FAILED"; - case CUFFT_INVALID_SIZE: - return "CUFFT_INVALID_SIZE"; - case CUFFT_UNALIGNED_DATA: - return "CUFFT_UNALIGNED_DATA"; - case CUFFT_INCOMPLETE_PARAMETER_LIST: - return "CUFFT_INCOMPLETE_PARAMETER_LIST"; - case CUFFT_INVALID_DEVICE: - return "CUFFT_INVALID_DEVICE"; - case CUFFT_PARSE_ERROR: - return "CUFFT_PARSE_ERROR"; - case CUFFT_NO_WORKSPACE: - return "CUFFT_NO_WORKSPACE"; - case CUFFT_NOT_IMPLEMENTED: - return "CUFFT_NOT_IMPLEMENTED"; -#ifndef __HIPCC__ - case CUFFT_LICENSE_ERROR: - return "CUFFT_LICENSE_ERROR"; -#endif - case CUFFT_NOT_SUPPORTED: - return "CUFFT_NOT_SUPPORTED"; - default: - std::ostringstream ss; - ss << "unknown error " << error; - return ss.str(); +// Calculates the normalization constant +double fft_normalization_scale(FFTNormMode normalization, + const std::vector& sizes, + const std::vector& dims) { + // auto norm = static_cast(normalization); + if (normalization == FFTNormMode::none) { + return static_cast(1.0); } -} -static inline void CUFFT_CHECK(cufftResult error) { - PADDLE_ENFORCE_CUDA_SUCCESS(error); + int64_t signal_numel = 1; + for (auto dim : dims) { + signal_numel *= sizes[dim]; + } + const double scale_denom = (normalization == FFTNormMode::by_sqrt_n) + ? std::sqrt(signal_numel) + : static_cast(signal_numel); + return static_cast(1.0 / scale_denom); } -// This struct is used to easily compute hashes of the -// parameters. It will be the **key** to the plan cache. -struct PlanKey { - // between 1 and kMaxCUFFTNdim, i.e., 1 <= signal_ndim <= 3 - int64_t signal_ndim_; - // These include additional batch dimension as well. 
- int64_t sizes_[kMaxDataNdim]; - int64_t input_shape_[kMaxDataNdim]; - int64_t output_shape_[kMaxDataNdim]; - FFTTransformType fft_type_; - ScalarType value_type_; - - PlanKey() = default; - - PlanKey(const std::vector& in_shape, - const std::vector& out_shape, - const std::vector& signal_size, FFTTransformType fft_type, - ScalarType value_type) { - // Padding bits must be zeroed for hashing - memset(this, 0, sizeof(*this)); - signal_ndim_ = signal_size.size() - 1; - fft_type_ = fft_type; - value_type_ = value_type; - - std::copy(signal_size.cbegin(), signal_size.cend(), sizes_); - std::copy(in_shape.cbegin(), in_shape.cend(), input_shape_); - std::copy(out_shape.cbegin(), out_shape.cend(), output_shape_); +template +void exec_normalization(const DeviceContext& ctx, const Tensor* in, Tensor* out, + FFTNormMode normalization, + const std::vector& sizes, + const std::vector& axes) { + double scale = fft_normalization_scale(normalization, sizes, axes); + if (scale != 1.0) { + auto eigen_out = framework::EigenVector::Flatten(*out); + auto eigen_in = framework::EigenVector::Flatten(*in); + auto dev = ctx.eigen_device(); + EigenScale::Eval(*dev, eigen_out, eigen_in, + static_cast(scale), + static_cast(0), false); + } else { + framework::TensorCopy(*in, ctx.GetPlace(), out); } -}; - -// An RAII encapsulation of cuFFTHandle -class CuFFTHandle { - ::cufftHandle handle_; - - public: - CuFFTHandle() { CUFFT_CHECK(platform::dynload::cufftCreate(&handle_)); } +} - ::cufftHandle& get() { return handle_; } - const ::cufftHandle& get() const { return handle_; } +#if defined(PADDLE_WITH_CUDA) +CuFFTConfig create_cufft_config(const framework::Tensor& input, + const framework::Tensor& output, + int signal_ndim) { + // Create the transform plan (either from cache or locally) + const auto value_type = framework::IsComplexType(input.type()) + ? framework::ToRealType(input.type()) + : input.type(); + auto fft_type = GetFFTTransformType(input.type(), output.type()); + // signal sizes + std::vector signal_size(signal_ndim + 1); - ~CuFFTHandle() { -// Not using fftDestroy() for rocFFT to work around double freeing of handles -#ifndef __HIPCC__ - CUFFT_CHECK(platform::dynload::cufftDestroy(handle_)); -#endif + signal_size[0] = input.dims()[0]; + for (int64_t i = 1; i <= signal_ndim; ++i) { + auto in_size = input.dims()[i]; + auto out_size = output.dims()[i]; + signal_size[i] = std::max(in_size, out_size); } -}; + PlanKey key(framework::vectorize(input.dims()), + framework::vectorize(output.dims()), signal_size, fft_type, + value_type); -#ifdef __HIPCC__ -using plan_size_type = int; -#else -using plan_size_type = long long int; // NOLINT -#endif + return CuFFTConfig(key); +} -// This class contains all the information needed to execute a cuFFT plan: -// 1. the plan -// 2. the workspace size needed -class CuFFTConfig { - public: - // Only move semantics is enought for this class. Although we already use - // unique_ptr for the plan, still remove copy constructor and assignment op so - // we don't accidentally copy and take perf hit. 
- CuFFTConfig(const CuFFTConfig&) = delete; - CuFFTConfig& operator=(CuFFTConfig const&) = delete; - - explicit CuFFTConfig(const PlanKey& plan_key) - : CuFFTConfig( - std::vector(plan_key.sizes_, - plan_key.sizes_ + plan_key.signal_ndim_ + 1), - plan_key.signal_ndim_, plan_key.fft_type_, plan_key.value_type_) {} - - // sizes are full signal, including batch size and always two-sided - CuFFTConfig(const std::vector& sizes, const int64_t signal_ndim, - FFTTransformType fft_type, ScalarType dtype) - : fft_type_(fft_type), value_type_(dtype) { - // signal sizes (excluding batch dim) - std::vector signal_sizes(sizes.begin() + 1, sizes.end()); - - // input batch size - const auto batch = static_cast(sizes[0]); - // const int64_t signal_ndim = sizes.size() - 1; - PADDLE_ENFORCE_EQ(signal_ndim, sizes.size() - 1, - platform::errors::InvalidArgument( - "The signal_ndim must be equal to sizes.size() - 1," - "But signal_ndim is: [%d], sizes.size() - 1 is: [%d]", - signal_ndim, sizes.size() - 1)); - -#ifdef __HIPCC__ - hipfftType exec_type = [&] { - if (dtype == framework::proto::VarType::FP32) { - switch (fft_type) { - case FFTTransformType::C2C: - return HIPFFT_C2C; - case FFTTransformType::R2C: - return HIPFFT_R2C; - case FFTTransformType::C2R: - return HIPFFT_C2R; - } - } else if (dtype == framework::proto::VarType::FP64) { - switch (fft_type) { - case FFTTransformType::C2C: - return HIPFFT_Z2Z; - case FFTTransformType::R2C: - return HIPFFT_D2Z; - case FFTTransformType::C2R: - return HIPFFT_Z2D; - } - } - PADDLE_THROW(platform::errors::InvalidArgument( - "hipFFT only support transforms of type float32 and float64")); - }(); -#else - cudaDataType itype, otype, exec_type; - const auto complex_input = has_complex_input(fft_type); - const auto complex_output = has_complex_output(fft_type); - if (dtype == framework::proto::VarType::FP32) { - itype = complex_input ? CUDA_C_32F : CUDA_R_32F; - otype = complex_output ? CUDA_C_32F : CUDA_R_32F; - exec_type = CUDA_C_32F; - } else if (dtype == framework::proto::VarType::FP64) { - itype = complex_input ? CUDA_C_64F : CUDA_R_64F; - otype = complex_output ? CUDA_C_64F : CUDA_R_64F; - exec_type = CUDA_C_64F; - } else if (dtype == framework::proto::VarType::FP16) { - itype = complex_input ? CUDA_C_16F : CUDA_R_16F; - otype = complex_output ? CUDA_C_16F : CUDA_R_16F; - exec_type = CUDA_C_16F; - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "cuFFT only support transforms of type float16, float32 and " - "float64")); - } -#endif +// Execute a pre-planned transform +static void exec_cufft_plan_raw(const CuFFTConfig& config, void* in_data, + void* out_data, bool forward) { + auto& plan = config.plan(); - // disable auto allocation of workspace to use allocator from the framework - CUFFT_CHECK(platform::dynload::cufftSetAutoAllocation( - plan(), /* autoAllocate */ 0)); - - size_t ws_size_t; - -// make plan -#ifdef __HIPCC__ - CUFFT_CHECK(hipfftMakePlanMany( - plan(), signal_ndim, signal_sizes.data(), - /* inembed */ nullptr, /* base_istride */ 1, /* idist */ 1, - /* onembed */ nullptr, /* base_ostride */ 1, /* odist */ 1, exec_type, - batch, &ws_size_t)); -#else - - CUFFT_CHECK(platform::dynload::cufftXtMakePlanMany( - plan(), signal_ndim, signal_sizes.data(), - /* inembed */ nullptr, /* base_istride */ 1, /* idist */ 1, itype, - /* onembed */ nullptr, /* base_ostride */ 1, /* odist */ 1, otype, - batch, &ws_size_t, exec_type)); -#endif + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cufftXtExec( + plan, in_data, out_data, forward ? 
CUFFT_FORWARD : CUFFT_INVERSE)); +} - ws_size = ws_size_t; +template +void exec_cufft_plan(const DeviceContext& ctx, const CuFFTConfig& config, + framework::Tensor* input, framework::Tensor* output, + bool forward) { + // execute transform plan + auto fft_type = config.transform_type(); + if (fft_type == FFTTransformType::C2R && forward) { + forward = false; + framework::Tensor input_conj(input->type()); + input_conj.mutable_data(input->dims(), ctx.GetPlace()); + platform::ForRange for_range(ctx, input->numel()); + math::ConjFunctor functor(input->data(), input->numel(), + input_conj.data()); + for_range(functor); + exec_cufft_plan_raw(config, input_conj.data(), output->data(), + forward); + } else if (fft_type == FFTTransformType::R2C && !forward) { + forward = true; + framework::Tensor out_conj(output->type()); + out_conj.mutable_data(output->dims(), ctx.GetPlace()); + exec_cufft_plan_raw(config, input->data(), out_conj.data(), + forward); + + platform::ForRange for_range(ctx, output->numel()); + math::ConjFunctor functor(out_conj.data(), output->numel(), + output->data()); + for_range(functor); + } else { + exec_cufft_plan_raw(config, input->data(), output->data(), + forward); } +} - const cufftHandle& plan() const { return plan_ptr.get(); } +#elif defined(PADDLE_WITH_HIP) - FFTTransformType transform_type() const { return fft_type_; } - ScalarType data_type() const { return value_type_; } - size_t workspace_size() const { return ws_size; } +HIPFFTConfig create_hipfft_config(const framework::Tensor& input, + const framework::Tensor& output, + int signal_ndim) { + // Create the transform plan (either from cache or locally) + const auto value_type = framework::IsComplexType(input.type()) + ? framework::ToRealType(input.type()) + : input.type(); + auto fft_type = GetFFTTransformType(input.type(), output.type()); + // signal sizes + std::vector signal_size(signal_ndim + 1); - private: - CuFFTHandle plan_ptr; - size_t ws_size; - FFTTransformType fft_type_; - ScalarType value_type_; -}; + signal_size[0] = input.dims()[0]; + for (int64_t i = 1; i <= signal_ndim; ++i) { + auto in_size = input.dims()[i]; + auto out_size = output.dims()[i]; + signal_size[i] = std::max(in_size, out_size); + } + PlanKey key(framework::vectorize(input.dims()), + framework::vectorize(output.dims()), signal_size, fft_type, + value_type); + + return HIPFFTConfig(key); +} // Execute a pre-planned transform -static void exec_cufft_plan(const CuFFTConfig& config, void* in_data, - void* out_data, bool forward) { +static void exec_hipfft_plan_raw(const HIPFFTConfig& config, void* in_data, + void* out_data, bool forward) { auto& plan = config.plan(); -#ifdef __HIPCC__ + auto value_type = config.data_type(); if (value_type == framework::proto::VarType::FP32) { switch (config.transform_type()) { case FFTTransformType::C2C: { - CUFFT_CHECK(hipfftExecC2C(plan, static_cast(in_data), - static_cast(out_data), - forward ? HIPFFT_FORWARD : HIPFFT_BACKWARD)); + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::hipfftExecC2C( + plan, static_cast(in_data), + static_cast(out_data), + forward ? 
HIPFFT_FORWARD : HIPFFT_BACKWARD)); return; } case FFTTransformType::R2C: { - CUFFT_CHECK(hipfftExecR2C(plan, static_cast(in_data), - static_cast(out_data))); + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::hipfftExecR2C( + plan, static_cast(in_data), + static_cast(out_data))); return; } case FFTTransformType::C2R: { - CUFFT_CHECK(hipfftExecC2R(plan, static_cast(in_data), - static_cast(out_data))); + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::hipfftExecC2R( + plan, static_cast(in_data), + static_cast(out_data))); return; } } } else if (value_type == framework::proto::VarType::FP64) { switch (config.transform_type()) { case FFTTransformType::C2C: { - CUFFT_CHECK(hipfftExecZ2Z(plan, - static_cast(in_data), - static_cast(out_data), - forward ? HIPFFT_FORWARD : HIPFFT_BACKWARD)); + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::hipfftExecZ2Z( + plan, static_cast(in_data), + static_cast(out_data), + forward ? HIPFFT_FORWARD : HIPFFT_BACKWARD)); return; } case FFTTransformType::R2C: { - CUFFT_CHECK(hipfftExecD2Z(plan, static_cast(in_data), - static_cast(out_data))); + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::hipfftExecD2Z( + plan, static_cast(in_data), + static_cast(out_data))); return; } case FFTTransformType::C2R: { - CUFFT_CHECK(hipfftExecZ2D(plan, - static_cast(in_data), - static_cast(out_data))); + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::hipfftExecZ2D( + plan, static_cast(in_data), + static_cast(out_data))); return; } } } PADDLE_THROW(platform::errors::InvalidArgument( "hipFFT only support transforms of type float32 and float64")); -#else - CUFFT_CHECK(platform::dynload::cufftXtExec( - plan, in_data, out_data, forward ? CUFFT_FORWARD : CUFFT_INVERSE)); -#endif } +template +void exec_hipfft_plan(const DeviceContext& ctx, const HIPFFTConfig& config, + framework::Tensor* input, framework::Tensor* output, + bool forward) { + auto fft_type = config.transform_type(); + if (fft_type == FFTTransformType::C2R && forward) { + forward = false; + framework::Tensor input_conj(input->type()); + input_conj.mutable_data(input->dims(), ctx.GetPlace()); + platform::ForRange for_range(ctx, input->numel()); + math::ConjFunctor functor(input->data(), input->numel(), + input_conj.data()); + for_range(functor); + exec_hipfft_plan_raw(config, input_conj.data(), output->data(), + forward); + } else if (fft_type == FFTTransformType::R2C && !forward) { + forward = true; + framework::Tensor out_conj(output->type()); + out_conj.mutable_data(output->dims(), ctx.GetPlace()); + exec_hipfft_plan_raw(config, input->data(), out_conj.data(), + forward); + + platform::ForRange for_range(ctx, output->numel()); + math::ConjFunctor functor(out_conj.data(), output->numel(), + output->data()); + for_range(functor); + } else { + exec_hipfft_plan_raw(config, input->data(), output->data(), + forward); + } +} + +#endif + // Execute a general unnormalized fft operation (can be c2c, onesided r2c or // onesided c2r) template void exec_fft(const DeviceContext& ctx, const Tensor* X, Tensor* out, const std::vector& dim, bool forward) { const auto x_dims = framework::vectorize(X->dims()); - const auto out_dims = framework::vectorize(out->dims()); const int64_t ndim = static_cast(X->dims().size()); - const int64_t signal_ndim = static_cast(dim.size()); - const int64_t batch_dims = ndim - signal_ndim; auto tensor_place = ctx.GetPlace(); - // Transpose batch dimensions first, then with transforming dims + // make a dim permutation std::vector dim_permute(ndim); - std::vector reverse_dim_permute(ndim); - std::vector 
trans_dims(ndim); std::iota(dim_permute.begin(), dim_permute.end(), int{0}); std::vector is_transformed_dim(ndim); for (const auto& d : dim) { @@ -340,160 +271,89 @@ void exec_fft(const DeviceContext& ctx, const Tensor* X, Tensor* out, std::sort(dim_permute.begin(), batch_end); std::copy(dim.cbegin(), dim.cend(), batch_end); - for (size_t i = 0; i < ndim; i++) { - trans_dims[i] = x_dims[dim_permute[i]]; // shape of input transpose - reverse_dim_permute[dim_permute[i]] = - static_cast(i); // reverse of dim permute - } - framework::Tensor input; - input.Resize(framework::make_ddim(trans_dims)); - input.mutable_data(tensor_place); - /* - auto in_ret = TransposeSimple::run(ctx, *X, dim_permute, input); - if (!in_ret) { - TransCompute(ndim, ctx, *X, input, dim_permute); - } - */ - TransCompute(ndim, ctx, *X, &input, dim_permute); + // transpose input according to dim permutation + auto transposed_input_shape = X->dims().transpose(dim_permute); + framework::Tensor transposed_input; + transposed_input.Resize(transposed_input_shape); + transposed_input.mutable_data(tensor_place); + TransCompute(ndim, ctx, *X, &transposed_input, + dim_permute); // Reshape batch dimensions into a single dimension - std::vector batched_sizes(signal_ndim + 1); + const int64_t signal_ndim = static_cast(dim.size()); + std::vector collapsed_input_shape(signal_ndim + 1); + + auto transposed_input_shape_ = framework::vectorize(transposed_input_shape); + const int64_t batch_dims = ndim - signal_ndim; auto batch_size = - std::accumulate(trans_dims.begin(), trans_dims.begin() + batch_dims, + std::accumulate(transposed_input_shape_.begin(), + transposed_input_shape_.begin() + batch_dims, static_cast(1), std::multiplies()); - batched_sizes[0] = batch_size; - std::copy(trans_dims.begin() + batch_dims, trans_dims.end(), - batched_sizes.begin() + 1); - input.Resize(framework::make_ddim(batched_sizes)); + collapsed_input_shape[0] = batch_size; - // Check the shape of transforming dims with input and output - std::vector signal_size(signal_ndim + 1); - signal_size[0] = batch_size; - for (int64_t i = 0; i < signal_ndim; ++i) { - auto in_size = input.dims()[i + 1]; - auto out_size = out_dims[dim[i]]; - signal_size[i + 1] = std::max(in_size, out_size); - PADDLE_ENFORCE_EQ( - (in_size == signal_size[i + 1] || - in_size == (signal_size[i + 1] / 2) + 1), - true, - platform::errors::InvalidArgument( - "The dimension[%d] of Input size: [%d] must be equal or half to " - "The dimension[%d] of Output size: [%d]", - dim[i], in_size, dim[i], out_size)); - PADDLE_ENFORCE_EQ( - (out_size == signal_size[i + 1] || - out_size == (signal_size[i + 1] / 2) + 1), - true, - platform::errors::InvalidArgument( - "The dimension[%d] of Output size: [%d] must be equal or half to " - "The dimension[%d] of Input size: [%d]", - dim[i], out_size, dim[i], in_size)); - } + std::copy(transposed_input_shape_.begin() + batch_dims, + transposed_input_shape_.end(), collapsed_input_shape.begin() + 1); - std::vector reshape_out_sizes(ndim); - for (size_t i = 0; i < ndim; ++i) { - reshape_out_sizes[i] = out_dims[dim_permute[i]]; - } - std::vector batched_out_sizes(batched_sizes.begin(), - batched_sizes.end()); + framework::Tensor& collapsed_input = transposed_input; + collapsed_input.Resize(framework::make_ddim(collapsed_input_shape)); + + // make a collpased output + const auto out_dims = framework::vectorize(out->dims()); + std::vector collapsed_output_shape(1 + signal_ndim); + collapsed_output_shape[0] = batch_size; for (size_t i = 0; i < dim.size(); ++i) { - 
batched_out_sizes[i + 1] = out_dims[dim[i]]; + collapsed_output_shape[i + 1] = out_dims[dim[i]]; } - - // output - framework::Tensor output; - output.Resize(framework::make_ddim(batched_out_sizes)); - output.mutable_data(tensor_place); - - // Create the transform plan (either from cache or locally) - const auto value_type = framework::IsComplexType(input.type()) - ? framework::ToRealType(input.type()) - : input.type(); - auto fft_type = GetFFTTransformType(input.type(), output.type()); - - PlanKey Key(framework::vectorize(input.dims()), - framework::vectorize(output.dims()), signal_size, fft_type, - value_type); - CuFFTConfig uncached_plan(Key); - CuFFTConfig* config = &uncached_plan; - auto& plan = config->plan(); - + framework::Tensor collapsed_output; + collapsed_output.Resize(framework::make_ddim(collapsed_output_shape)); + collapsed_output.mutable_data(tensor_place); + +#if defined(PADDLE_WITH_CUDA) + // create plan + CuFFTConfig config = + create_cufft_config(collapsed_input, collapsed_output, signal_ndim); // prepare cufft for execution - CUFFT_CHECK(platform::dynload::cufftSetStream(plan, ctx.stream())); + PADDLE_ENFORCE_CUDA_SUCCESS( + platform::dynload::cufftSetStream(config.plan(), ctx.stream())); framework::Tensor workspace_tensor; - workspace_tensor.mutable_data(tensor_place, config->workspace_size()); - CUFFT_CHECK( - platform::dynload::cufftSetWorkArea(plan, workspace_tensor.data())); + workspace_tensor.mutable_data(tensor_place, config.workspace_size()); + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cufftSetWorkArea( + config.plan(), workspace_tensor.data())); + // execute transform plan + exec_cufft_plan(ctx, config, &collapsed_input, + &collapsed_output, forward); +#elif defined(PADDLE_WITH_HIP) + // create plan + HIPFFTConfig config = + create_hipfft_config(collapsed_input, collapsed_output, signal_ndim); + // prepare cufft for execution + PADDLE_ENFORCE_CUDA_SUCCESS( + platform::dynload::hipfftSetStream(config.plan(), ctx.stream())); + framework::Tensor workspace_tensor; + workspace_tensor.mutable_data(tensor_place, config.workspace_size()); + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::hipfftSetWorkArea( + config.plan(), workspace_tensor.data())); // execute transform plan - if (fft_type == FFTTransformType::C2R && forward) { - forward = false; - framework::Tensor input_conj(input.type()); - input_conj.mutable_data(input.dims(), ctx.GetPlace()); - platform::ForRange for_range(ctx, input.numel()); - math::ConjFunctor functor(input.data(), input.numel(), - input_conj.data()); - for_range(functor); - exec_cufft_plan(*config, input_conj.data(), output.data(), - forward); - } else if (fft_type == FFTTransformType::R2C && !forward) { - forward = true; - framework::Tensor out_conj(output.type()); - out_conj.mutable_data(output.dims(), ctx.GetPlace()); - exec_cufft_plan(*config, input.data(), out_conj.data(), - forward); - - platform::ForRange for_range(ctx, output.numel()); - math::ConjFunctor functor(out_conj.data(), output.numel(), - output.data()); - for_range(functor); - } else { - exec_cufft_plan(*config, input.data(), output.data(), forward); - } + exec_hipfft_plan(ctx, config, &collapsed_input, + &collapsed_output, forward); +#endif // Inverting output by reshape and transpose to original batch and dimension - output.Resize(framework::make_ddim(reshape_out_sizes)); - out->Resize(framework::make_ddim(out_dims)); - TransCompute(ndim, ctx, output, out, reverse_dim_permute); -} + auto transposed_out_shape = out->dims().transpose(dim_permute); -// Calculates the 
normalization constant -double fft_normalization_scale(FFTNormMode normalization, - const std::vector& sizes, - const std::vector& dims) { - // auto norm = static_cast(normalization); - if (normalization == FFTNormMode::none) { - return static_cast(1.0); - } + collapsed_output.Resize(transposed_out_shape); + auto& transposed_output = collapsed_output; - int64_t signal_numel = 1; - for (auto dim : dims) { - signal_numel *= sizes[dim]; + std::vector reverse_dim_permute(ndim); + for (size_t i = 0; i < ndim; i++) { + reverse_dim_permute[dim_permute[i]] = i; } - const double scale_denom = (normalization == FFTNormMode::by_sqrt_n) - ? std::sqrt(signal_numel) - : static_cast(signal_numel); - return static_cast(1.0 / scale_denom); -} -template -void exec_normalization(const DeviceContext& ctx, const Tensor* in, Tensor* out, - FFTNormMode normalization, - const std::vector& sizes, - const std::vector& axes) { - double scale = fft_normalization_scale(normalization, sizes, axes); - if (scale != 1.0) { - auto eigen_out = framework::EigenVector::Flatten(*out); - auto eigen_in = framework::EigenVector::Flatten(*in); - auto dev = ctx.eigen_device(); - EigenScale::Eval(*dev, eigen_out, eigen_in, - static_cast(scale), - static_cast(0), false); - } else { - framework::TensorCopy(*in, ctx.GetPlace(), out); - } + TransCompute(ndim, ctx, transposed_output, out, + reverse_dim_permute); } + } // anonymous namespace // Use the optimized path to perform single R2C or C2R if transformation dim is diff --git a/paddle/fluid/platform/dynload/CMakeLists.txt b/paddle/fluid/platform/dynload/CMakeLists.txt index 8c64aad46cfc8..6e90ccfc51e1b 100644 --- a/paddle/fluid/platform/dynload/CMakeLists.txt +++ b/paddle/fluid/platform/dynload/CMakeLists.txt @@ -7,7 +7,7 @@ if (NOT WITH_NV_JETSON) endif() if (WITH_ROCM) - list(APPEND HIP_SRCS rocblas.cc miopen.cc hiprand.cc) + list(APPEND HIP_SRCS rocblas.cc miopen.cc hiprand.cc hipfft.cc) endif() # There is no macOS version of NCCL. diff --git a/paddle/fluid/platform/dynload/dynamic_loader.cc b/paddle/fluid/platform/dynload/dynamic_loader.cc index 0c5c47e38f85e..1bfd48b133907 100644 --- a/paddle/fluid/platform/dynload/dynamic_loader.cc +++ b/paddle/fluid/platform/dynload/dynamic_loader.cc @@ -356,6 +356,16 @@ void* GetCurandDsoHandle() { #endif } +#ifdef PADDLE_WITH_HIP +void* GetROCFFTDsoHandle() { +#if defined(__APPLE__) || defined(__OSX__) + return GetDsoHandleFromSearchPath(FLAGS_rocm_dir, "librocfft.dylib"); +#else + return GetDsoHandleFromSearchPath(FLAGS_rocm_dir, "librocfft.so"); +#endif +} +#endif + void* GetNvjpegDsoHandle() { #if defined(__APPLE__) || defined(__OSX__) return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libnvjpeg.dylib"); diff --git a/paddle/fluid/platform/dynload/dynamic_loader.h b/paddle/fluid/platform/dynload/dynamic_loader.h index 6260efdf71c59..1a66f4b979207 100644 --- a/paddle/fluid/platform/dynload/dynamic_loader.h +++ b/paddle/fluid/platform/dynload/dynamic_loader.h @@ -44,6 +44,7 @@ void* GetOpDsoHandle(const std::string& dso_name); void* GetNvtxDsoHandle(); void* GetCUFFTDsoHandle(); void* GetMKLRTDsoHandle(); +void* GetROCFFTDsoHandle(); void SetPaddleLibPath(const std::string&); } // namespace dynload diff --git a/paddle/fluid/platform/dynload/hipfft.cc b/paddle/fluid/platform/dynload/hipfft.cc new file mode 100644 index 0000000000000..767d2161be9d8 --- /dev/null +++ b/paddle/fluid/platform/dynload/hipfft.cc @@ -0,0 +1,30 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/platform/dynload/hipfft.h" + +namespace paddle { +namespace platform { +namespace dynload { + +std::once_flag hipfft_dso_flag; +void *hipfft_dso_handle; + +#define DEFINE_WRAP(__name) DynLoad__##__name __name + +HIPFFT_FFT_ROUTINE_EACH(DEFINE_WRAP); + +} // namespace dynload +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/dynload/hipfft.h b/paddle/fluid/platform/dynload/hipfft.h new file mode 100644 index 0000000000000..50c25935e41b7 --- /dev/null +++ b/paddle/fluid/platform/dynload/hipfft.h @@ -0,0 +1,124 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#pragma once +#ifdef PADDLE_WITH_HIP +#include + +#include // NOLINT + +#include "paddle/fluid/platform/dynload/dynamic_loader.h" +#include "paddle/fluid/platform/port.h" + +namespace paddle { +namespace platform { +namespace dynload { +extern std::once_flag hipfft_dso_flag; +extern void *hipfft_dso_handle; + +#define DECLARE_DYNAMIC_LOAD_HIPFFT_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) 
{ \ + using hipfftFunc = decltype(&::__name); \ + std::call_once(hipfft_dso_flag, []() { \ + hipfft_dso_handle = paddle::platform::dynload::GetROCFFTDsoHandle(); \ + }); \ + static void *p_##__name = dlsym(hipfft_dso_handle, #__name); \ + return reinterpret_cast(p_##__name)(args...); \ + } \ + }; \ + extern DynLoad__##__name __name + +#define HIPFFT_FFT_ROUTINE_EACH(__macro) \ + __macro(hipfftPlan1d); \ + __macro(hipfftPlan2d); \ + __macro(hipfftPlan3d); \ + __macro(hipfftPlanMany); \ + __macro(hipfftMakePlan1d); \ + __macro(hipfftMakePlanMany); \ + __macro(hipfftMakePlanMany64); \ + __macro(hipfftGetSizeMany64); \ + __macro(hipfftEstimate1d); \ + __macro(hipfftEstimate2d); \ + __macro(hipfftEstimate3d); \ + __macro(hipfftEstimateMany); \ + __macro(hipfftCreate); \ + __macro(hipfftGetSize1d); \ + __macro(hipfftGetSizeMany); \ + __macro(hipfftGetSize); \ + __macro(hipfftSetWorkArea); \ + __macro(hipfftSetAutoAllocation); \ + __macro(hipfftExecC2C); \ + __macro(hipfftExecR2C); \ + __macro(hipfftExecC2R); \ + __macro(hipfftExecZ2Z); \ + __macro(hipfftExecD2Z); \ + __macro(hipfftExecZ2D); \ + __macro(hipfftSetStream); \ + __macro(hipfftDestroy); \ + __macro(hipfftGetVersion); \ + __macro(hipfftGetProperty); + +HIPFFT_FFT_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_HIPFFT_WRAP); + +inline const char *hipfftGetErrorString(hipfftResult_t status) { + switch (status) { + case HIPFFT_SUCCESS: + return "'HIPFFT_SUCCESS'. The hipFFT operation was successful."; + case HIPFFT_INVALID_PLAN: + return "'HIPFFT_INVALID_PLAN'. hipFFT was passed an invalid plan handle."; + case HIPFFT_ALLOC_FAILED: + return "'HIPFFT_ALLOC_FAILED'. hipFFT failed to allocate GPU or CPU " + "memory."; + case HIPFFT_INVALID_TYPE: + return "'HIPFFT_INVALID_TYPE'. No longer used."; + case HIPFFT_INVALID_VALUE: + return "'HIPFFT_INVALID_VALUE'. User specified an invalid pointer or " + "parameter."; + case HIPFFT_INTERNAL_ERROR: + return "'HIPFFT_INTERNAL_ERROR'. Driver or internal hipFFT library " + "error."; + case HIPFFT_EXEC_FAILED: + return "'HIPFFT_EXEC_FAILED'. Failed to execute an FFT on the GPU."; + case HIPFFT_SETUP_FAILED: + return "'HIPFFT_SETUP_FAILED'. The hipFFT library failed to initialize."; + case HIPFFT_INVALID_SIZE: + return "'HIPFFT_INVALID_SIZE'. User specified an invalid transform size."; + case HIPFFT_UNALIGNED_DATA: + return "'HIPFFT_UNALIGNED_DATA'. No longer used."; + case HIPFFT_INCOMPLETE_PARAMETER_LIST: + return "'HIPFFT_INCOMPLETE_PARAMETER_LIST'. Missing parameters in call."; + case HIPFFT_INVALID_DEVICE: + return "'HIPFFT_INVALID_DEVICE'. Execution of a plan was on different " + "GPU than plan creation."; + case HIPFFT_PARSE_ERROR: + return "'HIPFFT_PARSE_ERROR'. Internal plan database error."; + case HIPFFT_NO_WORKSPACE: + return "'HIPFFT_NO_WORKSPACE'. No workspace has been provided prior to " + "plan execution."; + case HIPFFT_NOT_IMPLEMENTED: + return "'HIPFFT_NOT_IMPLEMENTED'. Function does not implement " + "functionality for parameters given."; + case HIPFFT_NOT_SUPPORTED: + return "'HIPFFT_NOT_SUPPORTED'. Operation is not supported for " + "parameters given."; + default: + return "HIPFFT_STATUS_UNKNOWN_ERROR"; + } +} +} // namespace dynload +} // namespace platform +} // namespace paddle + +#endif diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h index 7427060add8b1..caa495bb7f8c5 100644 --- a/paddle/fluid/platform/enforce.h +++ b/paddle/fluid/platform/enforce.h @@ -86,6 +86,7 @@ limitations under the License. 
*/ #endif // PADDLE_WITH_CUDA #ifdef PADDLE_WITH_HIP +#include "paddle/fluid/platform/dynload/hipfft.h" #include "paddle/fluid/platform/dynload/hiprand.h" #include "paddle/fluid/platform/dynload/miopen.h" #include "paddle/fluid/platform/dynload/rocblas.h" @@ -1113,6 +1114,14 @@ inline std::string build_rocm_error_msg(ncclResult_t nccl_result) { } #endif // not(__APPLE__) and PADDLE_WITH_NCCL +/***** HIPFFT ERROR *****/ +inline bool is_error(hipfftResult_t stat) { return stat != HIPFFT_SUCCESS; } + +inline std::string build_rocm_error_msg(hipfftResult_t stat) { + std::string msg(" HIPFFT error, "); + return msg + platform::dynload::hipfftGetErrorString(stat) + " "; +} + namespace details { template @@ -1129,6 +1138,7 @@ DEFINE_EXTERNAL_API_TYPE(hipError_t, hipSuccess); DEFINE_EXTERNAL_API_TYPE(hiprandStatus_t, HIPRAND_STATUS_SUCCESS); DEFINE_EXTERNAL_API_TYPE(miopenStatus_t, miopenStatusSuccess); DEFINE_EXTERNAL_API_TYPE(rocblas_status, rocblas_status_success); +DEFINE_EXTERNAL_API_TYPE(hipfftResult_t, HIPFFT_SUCCESS); #if !defined(__APPLE__) && defined(PADDLE_WITH_RCCL) DEFINE_EXTERNAL_API_TYPE(ncclResult_t, ncclSuccess); diff --git a/paddle/fluid/platform/enforce_test.cc b/paddle/fluid/platform/enforce_test.cc index c6d5f171ddce4..6ff9e6ea903cd 100644 --- a/paddle/fluid/platform/enforce_test.cc +++ b/paddle/fluid/platform/enforce_test.cc @@ -331,6 +331,10 @@ TEST(enforce, hip_success) { CheckCudaStatusFailure(rocblas_status_invalid_handle, "Rocblas error")); EXPECT_TRUE( CheckCudaStatusFailure(rocblas_status_invalid_value, "Rocblas error")); + EXPECT_TRUE(CheckCudaStatusSuccess(HIPFFT_SUCCESS)); + EXPECT_TRUE(CheckCudaStatusFailure(HIPFFT_INVALID_PLAN, "HIPFFT error")); + EXPECT_TRUE(CheckCudaStatusFailure(HIPFFT_ALLOC_FAILED, "HIPFFT error")); + #if !defined(__APPLE__) && defined(PADDLE_WITH_RCCL) EXPECT_TRUE(CheckCudaStatusSuccess(ncclSuccess)); EXPECT_TRUE(CheckCudaStatusFailure(ncclUnhandledCudaError, "Rccl error")); From a573a7ed7f4113cc7658b38f889e442bc805171e Mon Sep 17 00:00:00 2001 From: YipZLF <22539457+YipZLF@users.noreply.github.com> Date: Tue, 19 Oct 2021 14:03:46 +0800 Subject: [PATCH 022/116] Add auto parallel cost model and unittests (#36363) * Add auto parallel cost model and unittests * Fixed code styles. * Fixed bugs and codes style * fixed typo * Improved code style: object encapsulation. * Fixed codes. 
* Refractored estimate_cost * Fixed typo --- .../distributed/auto_parallel/__init__.py | 1 + .../distributed/auto_parallel/cost_model.py | 741 ++++++++++++++++++ .../fluid/tests/unittests/CMakeLists.txt | 3 + .../test_auto_parallel_cost_model.py | 236 ++++++ 4 files changed, 981 insertions(+) create mode 100644 python/paddle/distributed/auto_parallel/cost_model.py create mode 100644 python/paddle/fluid/tests/unittests/test_auto_parallel_cost_model.py diff --git a/python/paddle/distributed/auto_parallel/__init__.py b/python/paddle/distributed/auto_parallel/__init__.py index 31f92e2575a1f..2779a9feb0b83 100644 --- a/python/paddle/distributed/auto_parallel/__init__.py +++ b/python/paddle/distributed/auto_parallel/__init__.py @@ -21,5 +21,6 @@ from .completion import complete_annotation # noqa: F401 from .completion import complete_backward_annotation # noqa: F401 from .reshard import reshard # noqa: F401 +from .cost_model import estimate_cost __all__ = [] diff --git a/python/paddle/distributed/auto_parallel/cost_model.py b/python/paddle/distributed/auto_parallel/cost_model.py new file mode 100644 index 0000000000000..3fd438e2a624a --- /dev/null +++ b/python/paddle/distributed/auto_parallel/cost_model.py @@ -0,0 +1,741 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
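+
+# Overview: this module estimates the runtime and memory cost of an
+# auto-parallel distributed program. Each rank's sub-program is parsed into a
+# graph of op and variable nodes, linear chains and parallel branches are
+# iteratively merged to shrink the runtime graph, communication ops are costed
+# with a simple bus-bandwidth model (e.g. a ring allreduce of V bytes over n
+# ranks takes roughly 2 * (n - 1) / n * V / bandwidth), and pipeline execution
+# across stages is simulated to obtain the end-to-end runtime.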
+import numpy as np +import json +import queue +import copy +from enum import Enum +import paddle + +SUCC = 0 # successor +PRED = 1 # predecessor + + +class CostNodeType(Enum): + DEFAULT = 0 + COMPUTATION = 1 + COMMUNICATION = 2 + VARIABLE = 3 + MERGED = 4 + NOP = 5 + + +class Cost(object): + def __init__(self): + self.runtime = None + self.static_mem = None + self.peak_mem = None + + +class CostModelMode(Enum): + DEFAULT = 0 + BENCHMARKING = 1 # costs based on trial runs + ANALYSIS = 2 # costs based on analysis + MIXED = 3 + + +class CostNode(object): + def __init__(self, node, node_type, id=None): + self.id = id + self.node = node + self.type = node_type + self._cost = 0 + self.is_optim = False + self.is_bwd = False + + @property + def cost(self): + return self._cost + + @cost.setter + def cost(self, cost): + if cost < 0: + raise ValueError('Cost must be above 0.') + self._cost = cost + + +class MergedOpsCostNode(CostNode): + def __init__(self, node_type, id=None, base_node_list=None, is_bwd=False): + super(MergedOpsCostNode, self).__init__(None, node_type, id) + self.node_list = base_node_list + self.is_bwd = is_bwd + + +class CommOpCostNode(CostNode): + def __init__(self, + node, + node_type, + id=None, + comm_node_list=None, + is_bwd=False): + super(CommOpCostNode, self).__init__(node, node_type, id) + self.node_list = comm_node_list + self.ranks = [] + self.comm_type = node.type + self.is_bwd = is_bwd + + def set_ranks(self, ranks): + self.ranks = ranks + + def set_shapes(self, input_shape, output_shape): + self.input_shape = input_shape + self.output_shape = output_shape + + def init_comm_cost(self, cluster=None): + # ref: https://github.com/NVIDIA/nccl-tests/blob/master/doc/PERFORMANCE.md + # should get from `cluster` + BANDWIDTH = 32 * 1024 / 1000 # MB/ms, V100 PCIe + num_ranks = len(self.ranks) + comm_volumn = np.prod(self.input_shape) * 4 + + if 'allreduce' in self.comm_type: + self._cost = comm_volumn / (BANDWIDTH * num_ranks / + (2 * (num_ranks - 1))) + elif 'gather' in self.comm_type: + self._cost = comm_volumn / (BANDWIDTH * num_ranks / (num_ranks - 1)) + elif 'broadcast' in self.comm_type: + self._cost = comm_volumn / BANDWIDTH + elif 'send' in self.comm_type or 'recv' in self.comm_type: + self._cost = comm_volumn / BANDWIDTH + else: + self._cost = 0 + + +class TensorCostNode(CostNode): + def __init__(self, + node, + node_type, + id=None, + base_node_list=None, + batch_size=None, + shared_node_id=None): + super(TensorCostNode, self).__init__(node, node_type, id) + self.shape = node.shape + self.dtype = node.dtype + self.dtype_factor = 1 + self.persistable = None + self.shared_node_id = shared_node_id + if self.dtype == paddle.float32 or node.dtype == paddle.int32: + self.dtype_factor *= 4 + elif node.dtype == paddle.int64: + self.dtype_factor *= 8 + else: + raise NotImplementedError("{} not counted".format(v.node.dtype)) + + self.batch_size = None + if batch_size is not None: + self.batch_size = batch_size + + def get_size(self): + p = 1 + for i in self.node.shape: + if i == -1: # deal with placeholder + assert self.batch_size is not None, "Batch size not decided." 
+ i = self.batch_size + p *= i + return p + + +class CompOpCostNode(CostNode): + def __init__(self, node, node_type, id=None, is_bwd=False, is_optim=False): + super(CompOpCostNode, self).__init__(node, node_type, id) + self.is_bwd = is_bwd + self.is_optim = is_optim + + def init_comp_cost(self, cost_data): + # TODO: improve fluid.CostModel for more specific cost_data + op_name = self.node.type + if op_name in cost_data.keys(): + self.cost = cost_data[op_name] + else: + self.cost = 0.0 + + +class PipeEvent(object): + def __init__(self, stage_id, event_name, duration, start_time=-1): + self.stage_id = stage_id + self.name = event_name + self.duration = duration + self.s_time = start_time + self.e_time = -1 + + +class CostModel(object): + def __init__(self, + mode=CostModelMode.BENCHMARKING, + cluster=None, + batch_size=1, + microbatch_num=1, + opcall_overhead=0, + standalone_cost_data=None, + pipeline_config=None): + self.mode = mode + + # parameters + self.opcall_overhead = opcall_overhead + self.batch_size = batch_size + self.microbatch_num = microbatch_num + + self.nodes = {} # name -> node + + self.origin_graph = {} # original graph + self.op_graph = {} # op graph (no variables nodes) + self.runtime_graph = {} # runtime graph, for simulation + + self.cluster = cluster + self.cost_data = standalone_cost_data + self.pp2rank = pipeline_config + if self.pp2rank is not None: + self.rank2pp = {} + for stage_idx, ranks in enumerate(self.pp2rank): + for rank in ranks: + self.rank2pp[rank] = stage_idx + else: + self.rank2pp = None + + self.ring2rank = {} + + self.fwd_time = [] + self.bwd_time = [] + self.optim_time = [] + + def _parse_sub_program(self, program, nodes, graph, cost_data, sub_idx): + assert len( + program.blocks) == 1, "Program more than 1 block not supported." 
+ block = program.blocks[0] + + for var in block.vars.values(): + var_id = var.name + nodes[var_id] = TensorCostNode(var, CostNodeType.VARIABLE, var_id) + graph[var_id] = [[], []] + + for op in block.ops: + op_id = op.type + "_" + str(op.idx) + if op.type.startswith('c_') or op.type.startswith( + 'send') or op.type.startswith('recv'): + is_bwd = False + if op.type.startswith('c_'): + ring_id = op.attr('ring_id') + if ring_id not in self.ring2rank: + self.ring2rank[ring_id] = set() + self.ring2rank[ring_id].add(sub_idx) + is_bwd = '@GRAD' in op.output('Out')[0] + elif op.type.startswith('recv'): + is_bwd = '@GRAD' in op.output('Out')[0] + elif op.type.startswith('send'): + is_bwd = '@GRAD' in op.input('X')[0] + op_node = CommOpCostNode(op, CostNodeType.COMMUNICATION, op_id, + is_bwd) + else: + is_bwd = '_grad' in op.type + is_optim = 'LearningRate' in op.input_names + op_node = CompOpCostNode(op, CostNodeType.COMPUTATION, op_id, + is_bwd, is_optim) + op_node.init_comp_cost(cost_data) + + nodes[op_id] = op_node + graph[op_id] = [[], []] + + comm_input_shape = [0] + comm_output_shape = [0] + for i in range(len(op.input_names)): + try: + var_id = op.input(op.input_names[i])[0] + var_node = nodes[var_id] + graph[op_id][PRED].append(var_node.id) + graph[var_id][SUCC].append(op_node.id) + comm_input_shape = var_node.shape + except: + continue + for i in range(len(op.output_names)): + try: + var_id = op.output(op.output_names[i])[0] + var_node = nodes[var_id] + graph[op_id][SUCC].append(var_node.id) + graph[var_id][PRED].append(op_node.id) + comm_output_shape = var_node.shape + except: + continue + if op_node.type == CostNodeType.COMMUNICATION: + op_node.set_shapes(comm_input_shape, comm_output_shape) + + # resolve hazard: rename the r/w hazard variable nodes to ensure self.origin_graph is a DAG + new_var_dict = {} + for node_id, node in nodes.items(): + if node.type == CostNodeType.VARIABLE and node.node.persistable: + write_op_cnt = 0 + for pred_id in graph[node_id][PRED]: + pred = nodes[pred_id] + if pred.type == CostNodeType.COMPUTATION and ( + pred_id in graph[node_id][SUCC]): + + graph[pred_id][SUCC].remove(node_id) + graph[node_id][PRED].remove(pred_id) + + write_op_cnt += 1 + new_var_id = node_id + '_write_{}'.format(write_op_cnt) + new_var = TensorCostNode( + node.node, + CostNodeType.VARIABLE, + new_var_id, + shared_node_id=node_id) + + graph[new_var_id] = [[], []] + graph[pred_id][SUCC].append(new_var_id) + graph[new_var_id][PRED].append(pred_id) + + new_var_dict[new_var_id] = new_var + for k, v in new_var_dict.items(): + nodes[k] = v + return nodes + + def parse_program(self, distributed_program): + self.distributed_program = distributed_program + self.total_rank = len(self.distributed_program) + sub_prog_cnt = len(distributed_program) + self.nodes = [] * sub_prog_cnt + self.origin_graph = [] * sub_prog_cnt # original graph + self.op_graph = [] * sub_prog_cnt # op graph (no variables nodes) + self.runtime_graph = [] * sub_prog_cnt # runtime graph, for simulation + + for sub_idx, sub_prog in enumerate(distributed_program): + self.nodes.append({}) + self.origin_graph.append({}) + self.op_graph.append({}) + self.runtime_graph.append({}) + self._parse_sub_program( + sub_prog, self.nodes[sub_idx], self.origin_graph[sub_idx], + self.cost_data[0 if self.rank2pp is None else self.rank2pp[ + sub_idx]], sub_idx) + return self.nodes + + def _find_succ_op(self, node_id, sub_idx=0): + succ_ops_id = [] + for succ_id in self.origin_graph[sub_idx][node_id][SUCC]: + succ = self.nodes[sub_idx][succ_id] 
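+            # keep computation/communication ops directly; recurse through
+            # variable nodes so the op graph connects ops to ops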
+ if succ.type == CostNodeType.COMMUNICATION or \ + succ.type == CostNodeType.COMPUTATION: + succ_ops_id.append(succ_id) + elif succ.type == CostNodeType.VARIABLE: + succ_ops_id = succ_ops_id + self._find_succ_op(succ_id, sub_idx) + else: + raise NotImplementedError( + 'This type of node not supported yet:{}'.format(succ.type)) + return succ_ops_id + + def build_op_graph(self): + for sub_idx in range(self.total_rank): + op_nodes_id = [] + for node_id, node in self.nodes[sub_idx].items(): + if node.type == CostNodeType.VARIABLE: + continue + self.op_graph[sub_idx][node_id] = [[], []] + op_nodes_id.append(node_id) + for op_id in op_nodes_id: + succ_nodes_id = self._find_succ_op(op_id, sub_idx) + + self.op_graph[sub_idx][op_id][SUCC] = succ_nodes_id + for succ_id in succ_nodes_id: + self.op_graph[sub_idx][succ_id][PRED].append(op_id) + + def build_runtime_graph(self): + self.runtime_graph = copy.deepcopy(self.op_graph) + + def eliminate_multi_edges(self, graph=None): + for node_id, edges in graph.items(): + graph[node_id][PRED] = list(set(edges[PRED])) + graph[node_id][SUCC] = list(set(edges[SUCC])) + + def merge_comm(self): + for sub_idx in range(self.total_rank): + for node_id, edges in self.op_graph[sub_idx].items(): + node = self.nodes[sub_idx][node_id] + if node_id.startswith('c_'): + ring_id = node.node.attr('ring_id') + node.set_ranks(list(self.ring2rank[ring_id])) + node.init_comm_cost(self.cluster) + elif node_id.startswith('send') or node_id.startswith('recv'): + peer_rank = node.node.attr('peer') + node.set_ranks([sub_idx, peer_rank]) + node.init_comm_cost(self.cluster) + else: + pass # Not communication op + + def _merge_node(self, to_merge_node_list, merge_type='linear', nodes=None): + nodes_list = [] + node_cost = 0 + for node in to_merge_node_list: + if isinstance(node, MergedOpsCostNode): + nodes_list += node.node_list + else: + nodes_list.append(node.id) + if merge_type == 'linear': + node_cost += node.cost + elif merge_type == 'branch': + node_cost = max(node_cost, node.cost) + else: + raise NotImplementedError( + 'This type of merging is not supported:{}'.format( + merge_type)) + merged_node_id = 'merged_' + str(len(nodes)) + is_bwd = to_merge_node_list[0].is_bwd + merged_node = MergedOpsCostNode( + CostNodeType.MERGED, + id=merged_node_id, + base_node_list=nodes_list, + is_bwd=is_bwd) + merged_node.cost = node_cost + return merged_node_id, merged_node + + def merge_linear(self): + ''' + This method does the following: + If X depends on Y only, they must be run sequentially. + [ e.g. A ->- C ->- D D and E depends on C only.] + [ B ->-/ \->- E C depends on A and B. ] + We merge X and Y into a new node and sum up their cost time. + ''' + cnt = 0 + for sub_idx in range(self.total_rank): + cnt += self._merge_linear( + self.nodes[sub_idx], self.runtime_graph[sub_idx], is_bwd=False) + cnt += self._merge_linear( + self.nodes[sub_idx], self.runtime_graph[sub_idx], is_bwd=True) + return cnt + + def merge_branch(self): + ''' + This method does the following: + If a node has more than one successor, there is *branch*. + [ e.g. 
A ->- B ->- D ] + [ \->- C ->- / , B and C can be run at the same time ] + case 1: if B or C is null (or D is directly dependent on A), + it's equivalent to A->C->D or A->B->D, fall back to self.merge_linear + case 2: if both B and C are some op, + merged_cost = max(cost(B), cost(C)) + ''' + cnt = 0 + for sub_idx in range(self.total_rank): + cnt += self._merge_branch( + self.nodes[sub_idx], self.runtime_graph[sub_idx], is_bwd=False) + cnt += self._merge_branch( + self.nodes[sub_idx], self.runtime_graph[sub_idx], is_bwd=True) + return cnt + + def _merge_linear(self, nodes, runtime_graph, is_bwd=False): + reduct_cnt = 0 + rt_nodes_id = list(runtime_graph.keys()) + for node_id in rt_nodes_id: + if node_id not in runtime_graph.keys(): + continue + node = nodes[node_id] + if not is_bwd == node.is_bwd or node.is_optim: + continue + edges = runtime_graph[node_id] + ind = len(edges[PRED]) # in_degree + if ind == 1: # only depend on one node + pred_id = edges[PRED][0] + pred = nodes[pred_id] + merged_node_id, merged_node = self._merge_node( + [node, pred], merge_type='linear', nodes=nodes) + nodes[merged_node_id] = merged_node + runtime_graph[merged_node_id] = [[], []] + + # delete edges and add new edges + succ = None + runtime_graph[merged_node_id][SUCC] = copy.deepcopy(edges[SUCC]) + if len(runtime_graph[pred_id][SUCC]) > 1: + # predecessor has more than 1 successor + # the merged_node is to inherit the rest of its successors + succ = runtime_graph[pred_id][SUCC] + succ.remove(node_id) + runtime_graph[merged_node_id][SUCC] += succ + runtime_graph[merged_node_id][PRED] = runtime_graph[pred_id][ + PRED] + for i in runtime_graph[pred_id][PRED]: + runtime_graph[i][SUCC].remove(pred_id) + runtime_graph[i][SUCC].append(merged_node_id) + + for i in edges[SUCC]: + runtime_graph[i][PRED].remove(node_id) + runtime_graph[i][PRED].append(merged_node_id) + if succ is not None: + for i in succ: + runtime_graph[i][PRED].remove(pred_id) + runtime_graph[i][PRED].append(merged_node_id) + + runtime_graph.pop(node_id) + runtime_graph.pop(pred_id) + reduct_cnt += 1 + self.eliminate_multi_edges(runtime_graph) + return reduct_cnt # the number of nodes that have been reduced + + def _merge_branch(self, nodes, runtime_graph, is_bwd=False): + reduct_cnt = 0 + rt_nodes_id = list(runtime_graph.keys()) + for node_id in rt_nodes_id: + node = nodes[node_id] + if not is_bwd == node.is_bwd or node.is_optim: + continue + edges = runtime_graph[node_id] + outd = len(edges[SUCC]) # out_degree + if outd > 1: # branch out + succ_nodes_id = edges[SUCC] + + succ_to_elim = [] + for succ_id in succ_nodes_id: + for succ_2_id in succ_nodes_id: + tmp = runtime_graph[succ_2_id][SUCC] + if succ_id in tmp: + succ_to_elim.append(succ_id) + break + for id in succ_to_elim: + edges[SUCC].remove(id) + runtime_graph[id][PRED].remove(node_id) + reduct_cnt += 1 + + to_merge = True + if len(edges[SUCC]) < 1 or len(runtime_graph[edges[SUCC][0]][ + SUCC]) < 1: + continue + end_node_id = runtime_graph[edges[SUCC][0]][SUCC][0] + for i in succ_nodes_id: + if len(runtime_graph[i][SUCC]) != 1 or \ + runtime_graph[i][SUCC][0] != end_node_id: + to_merge = False # if branches has different end node, we don't merge them + break + if to_merge: + to_merge_node_list = [nodes[i] for i in succ_nodes_id] + merged_node_id, merged_node = self._merge_node( + to_merge_node_list, merge_type='branch', nodes=nodes) + nodes[merged_node_id] = merged_node + runtime_graph[merged_node_id] = [[], []] + + # delete edges and add new edges + runtime_graph[merged_node_id][SUCC] = 
[end_node_id] + runtime_graph[merged_node_id][PRED] = edges[PRED] + + runtime_graph[end_node_id][PRED] = [merged_node_id] + runtime_graph[node_id][SUCC] = [merged_node_id] + + for i in succ_nodes_id: + runtime_graph.pop(i) + reduct_cnt += len(to_merge_node_list) - 1 + return reduct_cnt + + def get_runtime_cost(self): + def get_node_cost(node): + node_cost = node.cost + self.opcall_overhead + if isinstance(node, MergedOpsCostNode): + for it in node.node_list: + node_cost += self.opcall_overhead + return node_cost + + for sub_idx in range(self.total_rank): + fwd_cost = 0 + bwd_cost = 0 + optim_cost = 0 + for node_id in self.runtime_graph[sub_idx].keys(): + node = self.nodes[sub_idx][node_id] + if node.is_optim: + optim_cost += get_node_cost(node) + elif node.is_bwd: + bwd_cost += get_node_cost(node) + else: + fwd_cost += get_node_cost(node) + self.fwd_time.append(fwd_cost) + self.bwd_time.append(bwd_cost) + self.optim_time.append(optim_cost) + return self.fwd_time, self.bwd_time, self.optim_time + + def get_mem(self): + static_list = [] + top_list = [] + for sub_idx in range(self.total_rank): + static_mem, cur_mem, top_mem = self._simulate_mem( + self.nodes[sub_idx], self.origin_graph[sub_idx]) + static_list.append(static_mem) + top_list.append(top_mem) + return static_list, top_list + + def _simulate_mem(self, nodes, origin_graph): + q = queue.Queue(1024) + sim_graph = copy.deepcopy(origin_graph) + for node_id, node in nodes.items(): + if len(sim_graph[node_id][PRED]) == 0: + q.put(node_id) + + q.put('nop') + cur_mem = 0 + top_mem = -1 + static_mem = 0 + while not q.empty(): + node_id = q.get() + node = None + size = 0 + if node_id == 'nop': + top_mem = max(cur_mem, top_mem) + if q.empty(): + break + else: + q.put(node_id) + continue + else: + node = nodes[node_id] + if node.type == CostNodeType.VARIABLE: + size = node.get_size() + if node.node.persistable: + static_mem += size + cur_mem += size + edges = sim_graph[node_id] + if not (node.type == CostNodeType.VARIABLE and + node.node.persistable): + for succ_id in edges[SUCC]: + sim_graph[succ_id][PRED].remove(node_id) + if len(sim_graph[succ_id][PRED]) == 0: + q.put(succ_id) + for pred_id in edges[PRED]: + pred = nodes + if pred.type == CostNodeType.VARIABLE: + sim_graph[pred_id][SUCC].remove(node_id) + if len(sim_graph[pred_id][ + SUCC]) == 0 and not pred.node.persistable: + cur_mem -= pred.get_size() + return static_mem, cur_mem, top_mem + + def get_pipeline_time(self): + if self.total_rank <= 1: + return self.fwd_time[0] + self.bwd_time[0] + self.optim_time[0] + else: + return self._simulate_pipeline() + + def _simulate_pipeline(self): + stage_num = len(self.pp2rank) + event_list = [] + global_time = [0] * stage_num + total_time = 0 + fwd_cnt = list(range(stage_num, 0, -1)) + bwd_cnt = [self.microbatch_num] * stage_num + q = queue.Queue(1024) + + for i in range(self.microbatch_num): + q.put(PipeEvent(0, 'fwd', self.fwd_time[0])) + + while not q.empty(): + e = q.get() + stid = e.stage_id + if e.name == 'fwd': + if fwd_cnt[stid] > 0: + e.s_time = max(global_time[stid], e.s_time) + e.e_time = e.s_time + e.duration + event_list.append(e) + if stid != stage_num - 1: + q.put( + PipeEvent( + stid + 1, + 'fwd', + self.fwd_time[stid + 1], + start_time=e.e_time)) + else: + q.put( + PipeEvent( + stid, + 'bwd', + self.bwd_time[stid], + start_time=e.e_time)) + fwd_cnt[stid] -= 1 + global_time[stid] = e.e_time + else: + q.put(e) + elif e.name == 'bwd': + e.s_time = max(global_time[stid], e.s_time) + e.e_time = e.s_time + e.duration + 
event_list.append(e) + if stid != 0: + q.put( + PipeEvent( + stid - 1, + 'bwd', + self.bwd_time[stid - 1], + start_time=e.e_time)) + fwd_cnt[stid] += 1 + bwd_cnt[stid] -= 1 + if bwd_cnt[stid] == 0: + q.put( + PipeEvent( + stid, + 'optim', + self.optim_time[stid], + start_time=e.e_time)) + global_time[stid] = e.e_time + elif e.name == 'optim': + e.s_time = max(global_time[stid], e.s_time) + e.e_time = e.s_time + e.duration + event_list.append(e) + global_time[stid] = e.e_time + else: + raise NotImplementedError( + 'This type of pipe event is not supported yet.{}'.format( + e.name)) + + for t in global_time: + total_time = max(total_time, t) + return total_time + + def get_cost(self): + cost = Cost() + static_mem, peak_mem = self.get_mem() + cost.static_mem = static_mem + cost.peak_mem = peak_mem + self.merge_comm() + while True: + cnt = 0 + cnt += self.merge_linear() + cnt += self.merge_branch() + if cnt == 0: # can't be further merged + break + self.get_runtime_cost() + cost.runtime = self.get_pipeline_time() + return cost + + def init(self, distributed_program): + self.parse_program(distributed_program) + self.build_op_graph() + for sub_idx in range(self.total_rank): + self.eliminate_multi_edges(self.op_graph[sub_idx]) + self.build_runtime_graph() + + +def estimate_cost(distributed_program, cluster, pipeline_config, + standalone_cost_data, batch_size): + """ + Estimated cost from distributed program, cluster model and distributed settings. + + Args: + distributed_program(list): list of paddle programs + cluster(Cluster): cluster model + standalone_cost_data(CostData): cost data given by paddle.core + batch_size(int): batch size of the training workload + pipeline_config(list): configuration of pipeline stage allocation + """ + # the following line is left for now, cluster model will be involved in the future + assert cluster is None, "For now, cluster remains None" + cm_ctx = CostModel( + cluster=cluster, + batch_size=batch_size, + standalone_cost_data=standalone_cost_data, + pipeline_config=pipeline_config) + cm_ctx.init(distributed_program) + cost = cm_ctx.get_cost() + return cost diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index f883d7a80a412..90f59758a2faf 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -91,6 +91,7 @@ list(APPEND MIXED_DIST_TEST_OPS test_auto_parallel_reshard) list(APPEND MIXED_DIST_TEST_OPS test_auto_parallel_reshard_serial) list(APPEND MIXED_DIST_TEST_OPS test_auto_parallel_reshard_mppp) list(APPEND MIXED_DIST_TEST_OPS test_auto_parallel_reshard_dpmppp) +list(APPEND MIXED_DIST_TEST_OPS test_auto_parallel_cost_model) foreach(TEST_OP ${MIXED_DIST_TEST_OPS}) list(REMOVE_ITEM TEST_OPS ${TEST_OP}) endforeach() @@ -234,6 +235,7 @@ if ((NOT WITH_GPU) AND (NOT WITH_ROCM)) LIST(REMOVE_ITEM TEST_OPS test_auto_parallel_reshard_serial) LIST(REMOVE_ITEM TEST_OPS test_auto_parallel_reshard_mppp) LIST(REMOVE_ITEM TEST_OPS test_auto_parallel_reshard_dpmppp) + LIST(REMOVE_ITEM TEST_OPS test_auto_parallel_cost_model) LIST(REMOVE_ITEM TEST_OPS test_auto_parallel_data_unshard) elseif(WITH_GPU) if (${CUDNN_VERSION} VERSION_LESS 7100) @@ -608,6 +610,7 @@ if(WITH_DISTRIBUTE) py_test_modules(test_auto_parallel_reshard_serial MODULES test_auto_parallel_reshard_serial ENVS ${dist_ENVS}) py_test_modules(test_auto_parallel_reshard_mppp MODULES test_auto_parallel_reshard_mppp ENVS ${dist_ENVS}) py_test_modules(test_auto_parallel_reshard_dpmppp 
MODULES test_auto_parallel_reshard_dpmppp ENVS ${dist_ENVS}) + py_test_modules(test_auto_parallel_cost_model MODULES test_auto_parallel_cost_model ENVS ${dist_ENVS}) endif(NOT WIN32) endif(NOT APPLE) if(WITH_DGC) diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_cost_model.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_cost_model.py new file mode 100644 index 0000000000000..58d033ad65831 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_cost_model.py @@ -0,0 +1,236 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest + +import paddle +import paddle.nn as nn +import paddle.static as static +import paddle.nn.functional as F +import paddle.utils as utils +import paddle.distributed.auto_parallel as auto +from paddle.distributed.auto_parallel.context import DistributedContext +from paddle.distributed import fleet +from paddle.distributed.auto_parallel.partitioner import Partitioner +from paddle.distributed.auto_parallel.completion import complete_backward_annotation +from paddle.distributed.auto_parallel.reshard import reshard +from paddle.distributed.auto_parallel.cost_model import estimate_cost +import paddle.fluid.core as core + +paddle.enable_static() +_global_parallel_strategy = "dp_mp_pp" +ROOT_MESH = auto.ProcessMesh([[[0, 1], [4, 5]], [[2, 3], [6, 7]]]) +_global_process_mesh = auto.ProcessMesh( + [[[0, 1], [4, 5]], [[2, 3], [6, 7]]], parent=ROOT_MESH) +PP_MESH_0 = auto.ProcessMesh([[0, 1], [4, 5]], parent=ROOT_MESH) +PP_MESH_1 = auto.ProcessMesh([[2, 3], [6, 7]], parent=ROOT_MESH) +NUM_RANKS = 8 +STAGE_0_CNT = 5 +STAGE_1_CNT = 10 +pp_cfg = [[0, 1, 4, 5], [2, 3, 6, 7]] + +device = "gpu" if core.is_compiled_with_cuda() else "cpu" + + +class MLPLayer(nn.Layer): + def __init__(self, + hidden_size=256, + intermediate_size=4 * 256, + initializer_range=0.02, + is_distributed=True): + super(MLPLayer, self).__init__() + d_model = hidden_size + dim_feedforward = intermediate_size + weight_attr = paddle.ParamAttr(initializer=nn.initializer.Normal( + mean=0.0, std=initializer_range)) + bias_attr = None + + self.linear0 = nn.Linear( + d_model, dim_feedforward, weight_attr, bias_attr=bias_attr) + self.linear1 = nn.Linear( + dim_feedforward, d_model, weight_attr, bias_attr=bias_attr) + self.norm = nn.LayerNorm(d_model, epsilon=1e-5) + + self.is_distributed = is_distributed + + def forward(self, input): + if self.is_distributed: + auto.shard_tensor( + self.linear0.weight, PP_MESH_0, dim_mapping=[-1, 1]) + auto.shard_tensor( + self.linear1.weight, PP_MESH_1, dim_mapping=[1, -1]) + + out = self.norm(input) + out = self.linear0(out) + out = F.gelu(out, approximate=True) + out = self.linear1(out) + + return out + + +def get_single_node_data(): + train_program = paddle.static.Program() + startup_program = paddle.static.Program() + + loss, train_program, startup_program = mlp_forward( + train_program, startup_program, 
is_distributed=False) + + cost_model = core.CostModel() + cost_data = cost_model.profile_measure(train_program, startup_program, + device, ["time"]) + + op_name2cost = [{}, {}] + for idx, op in enumerate(train_program.blocks[0].ops): + if idx <= STAGE_0_CNT: + op_name2cost[0][op.type] = cost_data.get_op_time_ms(idx) + elif idx <= STAGE_1_CNT: + op_name2cost[1][op.type] = cost_data.get_op_time_ms(idx) + return op_name2cost + + +def mlp_forward(train_program, start_program, is_distributed=True): + with static.program_guard(train_program, + start_program), utils.unique_name.guard(): + batch_size = 4 + hidden_size = 256 + sequence_len = 128 + if is_distributed: + input = static.data( + name="input", shape=[batch_size, hidden_size], dtype='float32') + label = static.data( + name="label", shape=[batch_size, 1], dtype='float32') + else: + input = paddle.ones( + name="input", shape=[batch_size, hidden_size], dtype='float32') + label = paddle.ones( + name="label", shape=[batch_size, 1], dtype='float32') + + if is_distributed: + auto.shard_tensor(input, PP_MESH_0, dim_mapping=[0, -1]) + auto.shard_tensor(label, PP_MESH_1, dim_mapping=[0, -1]) + + mlp = MLPLayer( + hidden_size=hidden_size, + intermediate_size=4 * hidden_size, + initializer_range=0.02, + is_distributed=is_distributed) + + predict = mlp(input) + error_cost = paddle.nn.functional.square_error_cost(predict, label) + loss = paddle.mean(error_cost) + + return loss, train_program, start_program + + +def get_dist_prog(train_program, startup_program, dist_context, rank_id): + global _global_process_mesh + dist_context.set_process_mesh(_global_process_mesh) + loss, train_program, startup_program = mlp_forward(train_program, + startup_program) + + # auto completion + complete_train_program = auto.complete_annotation(train_program, + dist_context) + + dist_strategy = fleet.DistributedStrategy() + dist_main_prog = [] + dist_startup_prog = [] + for rank_id in range(NUM_RANKS): + partitioner = Partitioner(dist_strategy, dist_context, rank_id) + # logical partition + auto_parallel_main_prog, auto_parallel_startup_prog = partitioner.transpile_forward( + complete_train_program, startup_program) + dist_params_grads = partitioner.apply_backward( + loss, complete_train_program, startup_program, + auto_parallel_main_prog, auto_parallel_startup_prog) + optimizer = paddle.fluid.optimizer.AdamOptimizer() + opt_ops = partitioner.apply_optimize(optimizer, dist_params_grads, + auto_parallel_main_prog, + auto_parallel_startup_prog) + dist_main_prog.append(auto_parallel_main_prog) + dist_startup_prog.append(auto_parallel_startup_prog) + return dist_main_prog, dist_startup_prog + + +def check_runtime_estimation(cost): + return cost.runtime > 0 + + +def check_memory_estimation(cost): + for i in range(NUM_RANKS): + if cost.static_mem[i] <= 0 or cost.peak_mem[i] <= 0: + return False + if cost.static_mem[i] > cost.peak_mem[i]: + return False + return True + + +def check_empty_program_runtime(cost): + return cost.runtime == 0 + + +def check_empty_program_memory(cost): + for mem in cost.peak_mem: + if mem > 0: + return False + for mem in cost.static_mem: + if mem > 0: + return False + return True + + +class TestCostModel(unittest.TestCase): + def test_empty_program_cost_model(self): + empty_program = paddle.static.Program() + startup_program = paddle.static.Program() + standalone_cost_data = [{}] + empty_pp_cfg = None + cluster = None + cost = estimate_cost( + [empty_program], + cluster=cluster, + pipeline_config=empty_pp_cfg, + 
standalone_cost_data=standalone_cost_data, + batch_size=1) + + self.assertTrue(check_empty_program_runtime(cost)) + self.assertTrue(check_empty_program_memory(cost)) + + def test_auto_parallel_cost_model(self): + train_program = paddle.static.Program() + startup_program = paddle.static.Program() + dist_context = DistributedContext() + standalone_cost_data = get_single_node_data() + distributed_program, dist_startup_prog = get_dist_prog( + train_program, startup_program, dist_context, 0) + for rank_id in range(NUM_RANKS): + complete_backward_annotation(distributed_program[rank_id], + dist_context) + reshard(distributed_program[rank_id], dist_startup_prog[rank_id], + rank_id, dist_context) + cluster = None + cost = estimate_cost( + distributed_program, + cluster=cluster, + pipeline_config=pp_cfg, + standalone_cost_data=standalone_cost_data, + batch_size=4) + self.assertTrue(check_runtime_estimation(cost)) + self.assertTrue(check_memory_estimation(cost)) + + +if __name__ == "__main__": + unittest.main() From 34d785c22803db1d45148f8dfd175cbaae05a485 Mon Sep 17 00:00:00 2001 From: Yulong Ao Date: Tue, 19 Oct 2021 14:10:27 +0800 Subject: [PATCH 023/116] [paddle.linalg.qr] Add the Qr Operator (#35742) * Add QR decomposition op * Change codes to adapt to new svd_helper * Update linalg.py Restore the deleted comma * Restore the deleted line * Update linalg.py * Update linalg.py * Improve the qr code by reviews * Update QR based on CI results * Update qr doc, test=document_fix * Change unsafe and ill-formed codes --- cmake/operators.cmake | 1 + paddle/fluid/operators/qr_op.cc | 152 +++++++++ paddle/fluid/operators/qr_op.cu | 309 ++++++++++++++++++ paddle/fluid/operators/qr_op.h | 135 ++++++++ paddle/fluid/operators/svd_helper.h | 13 + paddle/fluid/platform/dynload/cusolver.h | 18 +- .../fluid/tests/unittests/test_qr_op.py | 173 ++++++++++ python/paddle/linalg.py | 2 + python/paddle/tensor/__init__.py | 2 + python/paddle/tensor/linalg.py | 66 +++- 10 files changed, 869 insertions(+), 2 deletions(-) create mode 100644 paddle/fluid/operators/qr_op.cc create mode 100644 paddle/fluid/operators/qr_op.cu create mode 100644 paddle/fluid/operators/qr_op.h create mode 100644 python/paddle/fluid/tests/unittests/test_qr_op.py diff --git a/cmake/operators.cmake b/cmake/operators.cmake index 228da9f77739d..5eecbefa2fcfb 100644 --- a/cmake/operators.cmake +++ b/cmake/operators.cmake @@ -185,6 +185,7 @@ function(op_library TARGET) list(REMOVE_ITEM hip_srcs "cholesky_op.cu") list(REMOVE_ITEM hip_srcs "matrix_rank_op.cu") list(REMOVE_ITEM hip_srcs "svd_op.cu") + list(REMOVE_ITEM hip_srcs "qr_op.cu") list(REMOVE_ITEM hip_srcs "eigh_op.cu") list(REMOVE_ITEM hip_srcs "multinomial_op.cu") list(REMOVE_ITEM hip_srcs "decode_jpeg_op.cu") diff --git a/paddle/fluid/operators/qr_op.cc b/paddle/fluid/operators/qr_op.cc new file mode 100644 index 0000000000000..f612bb9e31f93 --- /dev/null +++ b/paddle/fluid/operators/qr_op.cc @@ -0,0 +1,152 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/qr_op.h" +#include +#include +#include +#include +#include "paddle/fluid/framework/ddim.h" +#ifdef PADDLE_WITH_MKLDNN +#include "paddle/fluid/platform/mkldnn_helper.h" +#endif + +namespace paddle { +namespace operators { +using DDim = framework::DDim; + +class QrOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "qr"); + OP_INOUT_CHECK(ctx->HasOutput("Q"), "Output", "Q", "qr"); + OP_INOUT_CHECK(ctx->HasOutput("R"), "Output", "R", "qr"); + + auto x_dims = ctx->GetInputDim("X"); + int x_rank = x_dims.size(); + PADDLE_ENFORCE_GE(x_dims.size(), 2, + platform::errors::InvalidArgument( + "the rank of input must greater than 2")); + bool compute_q; + bool reduced_mode; + int m = x_dims[x_rank - 2]; + int n = x_dims[x_rank - 1]; + int min_mn = std::min(m, n); + std::string mode = ctx->Attrs().Get("mode"); + std::tie(compute_q, reduced_mode) = _parse_qr_mode(mode); + + if (compute_q) { + int k = reduced_mode ? min_mn : m; + auto q_dims_vec = framework::vectorize(x_dims); + q_dims_vec[q_dims_vec.size() - 1] = k; + ctx->SetOutputDim("Q", framework::make_ddim(q_dims_vec)); + } else { + ctx->SetOutputDim("Q", framework::make_ddim({0})); + } + + int k = reduced_mode ? min_mn : m; + auto r_dims_vec = framework::vectorize(x_dims); + r_dims_vec[r_dims_vec.size() - 2] = k; + r_dims_vec[r_dims_vec.size() - 1] = n; + ctx->SetOutputDim("R", framework::make_ddim(r_dims_vec)); + + ctx->ShareLoD("X", /*->*/ "Q"); + ctx->ShareLoD("X", /*->*/ "R"); + } +}; + +class QrOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "(Tensor), The input tensor of qr op."); + AddOutput("Q", "(Tensor), The output Q tensor of qr op."); + AddOutput("R", "(Tensor), The output R tensor of qr op."); + AddAttr( + "mode", + "(string, default \"reduced\"). " + "If mode is \"reduced\", Qr op will return reduced Q and R matrices. " + "If mode is \"complete\", Qr op will return complete Q and R matrices. " + "If mode is \"r\", Qr op will only return reduced R matrix.") + .SetDefault("reduced"); + AddComment(R"DOC( +Qr Operator. + +This operator is used to perform QR operation for batched matrics $X$. 
+$$Q, R = qr(X)$$ + +)DOC"); + } +}; + +class QrGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Q")), "Input", + "Q@Grad", "QrGrad"); + OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("R")), "Input", + "R@Grad", "QrGrad"); + OP_INOUT_CHECK(ctx->HasInput("Q"), "Input", "Q", "QrGrad"); + OP_INOUT_CHECK(ctx->HasInput("R"), "Input", "R", "QrGrad"); + OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("X")), "Output", + "X@Grad", "QrGrad"); + + auto x_dims = ctx->GetInputDim(("X")); + ctx->SetOutputDim(framework::GradVarName("X"), x_dims); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + auto dtype = OperatorWithKernel::IndicateVarDataType(ctx, "X"); + return framework::OpKernelType(dtype, ctx.GetPlace()); + } +}; + +template +class QrGradMaker : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + void Apply(GradOpPtr retv) const override { + retv->SetType("qr_grad"); + retv->SetInput(framework::GradVarName("Q"), this->OutputGrad("Q")); + retv->SetInput(framework::GradVarName("R"), this->OutputGrad("R")); + retv->SetInput("Q", this->Output("Q")); + retv->SetInput("R", this->Output("R")); + retv->SetInput("X", this->Input("X")); + retv->SetAttrMap(this->Attrs()); + retv->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OPERATOR(qr, ops::QrOp, ops::QrOpMaker, + ops::QrGradMaker, + ops::QrGradMaker); + +REGISTER_OPERATOR(qr_grad, ops::QrGradOp); + +REGISTER_OP_CPU_KERNEL(qr, ops::QrCPUKernel, ops::QrCPUKernel); + +REGISTER_OP_CPU_KERNEL( + qr_grad, ops::QrGradKernel, + ops::QrGradKernel); diff --git a/paddle/fluid/operators/qr_op.cu b/paddle/fluid/operators/qr_op.cu new file mode 100644 index 0000000000000..992df172ace0c --- /dev/null +++ b/paddle/fluid/operators/qr_op.cu @@ -0,0 +1,309 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifndef PADDLE_WITH_HIP +// HIP not support cusolver + +#include +#include +#include +#include "paddle/fluid/memory/memory.h" +#include "paddle/fluid/operators/qr_op.h" +#include "paddle/fluid/platform/dynload/cusolver.h" + +// Reuse some helper functions from svd +#include "paddle/fluid/operators/svd_helper.h" + +namespace paddle { +namespace operators { + +template +class QrGPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + bool compute_q; + bool reduced_mode; + auto& dev_ctx = + context.template device_context(); + const Tensor& x = *context.Input("X"); + Tensor& q = *context.Output("Q"); + Tensor& r = *context.Output("R"); + const std::string mode = context.Attr("mode"); + std::tie(compute_q, reduced_mode) = _parse_qr_mode(mode); + + auto numel = x.numel(); + PADDLE_ENFORCE_GT(numel, 0, platform::errors::PreconditionNotMet( + "The input of QR is empty.")); + auto x_dims = x.dims(); + int x_rank = x_dims.size(); + int m = x_dims[x_rank - 2]; + int n = x_dims[x_rank - 1]; + int min_mn = std::min(m, n); + int k = reduced_mode ? min_mn : m; + int batch_size = numel / (m * n); + int qr_stride = m * n; + int tau_stride = min_mn; + + if (compute_q) { + q.mutable_data>( + context.GetPlace(), + size_t(batch_size * m * k * sizeof(math::Real))); + } + r.mutable_data>( + context.GetPlace(), size_t(batch_size * k * n * sizeof(math::Real))); + + auto dito = + math::DeviceIndependenceTensorOperations(context); + + // Note: allocate temporary tensors because of lacking in-place operatios. + // Prepare qr + Tensor qr; + qr.mutable_data>( + context.GetPlace(), size_t(batch_size * m * n * sizeof(math::Real))); + // BatchedGeqrf performs computation in-place and 'qr' must be a copy of + // input + TensorCopy(x, context.GetPlace(), &qr); + + // Prepare tau + auto tau_dims_vec = framework::vectorize(x_dims); + tau_dims_vec.pop_back(); + tau_dims_vec[tau_dims_vec.size() - 1] = min_mn; + Tensor tau = dito.Fill(tau_dims_vec, 0); + + // Transpose 'qr' to conform the column-major order + auto tmp_qr = dito.Transpose(qr); + framework::TensorCopy(tmp_qr, qr.place(), &qr); + auto qr_data = qr.mutable_data(context.GetPlace()); + auto tau_data = tau.mutable_data(context.GetPlace()); + + BatchedGeqrf(dev_ctx, batch_size, m, n, qr_data, m, tau_data, qr_stride, + tau_stride); + + if (reduced_mode) { + auto trans_qr = dito.Transpose(qr); + auto sliced_qr = dito.Slice(trans_qr, {-2}, {0}, {min_mn}); + auto tmp_r = dito.TrilTriu(sliced_qr, 0, false); + // Transpose 'tmp_r' to retore the original row-major order + framework::TensorCopy(tmp_r, r.place(), &r); + } else { + auto trans_qr = dito.Transpose(qr); + auto tmp_r = dito.TrilTriu(trans_qr, 0, false); + // Transpose 'tmp_r' to retore the original row-major order + framework::TensorCopy(tmp_r, r.place(), &r); + } + + if (compute_q) { + // Perform QRGQR for Q using the result from GEQRF + // Transpose 'q' to retore the original row-major order + if (reduced_mode) { + BatchedOrgqr(dev_ctx, batch_size, m, min_mn, min_mn, qr_data, m, + tau_data, qr_stride, tau_stride); + auto trans_q = dito.Transpose(qr); + auto sliced_q = dito.Slice(trans_q, {-1}, {0}, {min_mn}); + framework::TensorCopy(sliced_q, q.place(), &q); + } else { + if (m > n) { + auto new_qr_dims_vec = framework::vectorize(x_dims); + new_qr_dims_vec[new_qr_dims_vec.size() - 1] = m; + Tensor new_qr = dito.Fill(new_qr_dims_vec, 0); + auto new_qr_data = new_qr.mutable_data(context.GetPlace()); + auto new_qr_stride = m * m; + for (int 
i = 0; i < batch_size; ++i) { + memory::Copy( + BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()), + (new_qr_data + i * new_qr_stride), + BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()), + (qr_data + i * qr_stride), qr_stride * sizeof(math::Real), + dev_ctx.stream()); + } + BatchedOrgqr(dev_ctx, batch_size, m, m, min_mn, new_qr_data, m, + tau_data, new_qr_stride, tau_stride); + auto trans_q = dito.Transpose(new_qr); + framework::TensorCopy(trans_q, q.place(), &q); + } else { + BatchedOrgqr(dev_ctx, batch_size, m, m, min_mn, qr_data, m, tau_data, + qr_stride, tau_stride); + auto trans_q = dito.Transpose(qr); + auto sliced_q = dito.Slice(trans_q, {-1}, {0}, {m}); + framework::TensorCopy(sliced_q, q.place(), &q); + } + } + } + } + + void BatchedGeqrf(const platform::CUDADeviceContext& dev_ctx, int batch_size, + int m, int n, float* a, int lda, float* tau, int a_stride, + int tau_stride) const; + + void BatchedGeqrf(const platform::CUDADeviceContext& dev_ctx, int batch_size, + int m, int n, double* a, int lda, double* tau, int a_stride, + int tau_stride) const; + + void BatchedOrgqr(const platform::CUDADeviceContext& dev_ctx, int batch_size, + int m, int n, int k, float* a, int lda, float* tau, + int a_stride, int tau_stride) const; + + void BatchedOrgqr(const platform::CUDADeviceContext& dev_ctx, int batch_size, + int m, int n, int k, double* a, int lda, double* tau, + int a_stride, int tau_stride) const; +}; + +template <> +void QrGPUKernel::BatchedGeqrf( + const platform::CUDADeviceContext& dev_ctx, int batch_size, int m, int n, + float* a, int lda, float* tau, int a_stride, int tau_stride) const { + int lwork = 0; + + auto handle = dev_ctx.cusolver_dn_handle(); + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cusolverDnSgeqrf_bufferSize( + handle, m, n, a, lda, &lwork)); + auto workspace = memory::Alloc(dev_ctx, lwork * sizeof(float)); + float* workspace_ptr = reinterpret_cast(workspace->ptr()); + auto info = memory::Alloc(dev_ctx, sizeof(int)); + int* info_d = reinterpret_cast(info->ptr()); + + for (int i = 0; i < batch_size; ++i) { + float* a_working_ptr = &a[i * a_stride]; + float* tau_working_ptr = &tau[i * tau_stride]; + // compute geqrf + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cusolverDnSgeqrf( + handle, m, n, a_working_ptr, lda, tau_working_ptr, workspace_ptr, lwork, + info_d)); + // Do we need synchronized here? + // check the error info + int info_h; + memory::Copy(platform::CPUPlace(), &info_h, + BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()), + info_d, sizeof(int), dev_ctx.stream()); + PADDLE_ENFORCE_EQ( + info_h, 0, + platform::errors::PreconditionNotMet( + "For batch [%d]: CUSolver geqrf is not zero. 
[%d]", i, info_h)); + } +} + +template <> +void QrGPUKernel::BatchedGeqrf( + const platform::CUDADeviceContext& dev_ctx, int batch_size, int m, int n, + double* a, int lda, double* tau, int a_stride, int tau_stride) const { + int lwork = 0; + + auto handle = dev_ctx.cusolver_dn_handle(); + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cusolverDnDgeqrf_bufferSize( + handle, m, n, a, lda, &lwork)); + auto workspace = memory::Alloc(dev_ctx, lwork * sizeof(double)); + double* workspace_ptr = reinterpret_cast(workspace->ptr()); + auto info = memory::Alloc(dev_ctx, sizeof(int)); + int* info_d = reinterpret_cast(info->ptr()); + + for (int i = 0; i < batch_size; ++i) { + double* a_working_ptr = &a[i * a_stride]; + double* tau_working_ptr = &tau[i * tau_stride]; + // compute geqrf + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cusolverDnDgeqrf( + handle, m, n, a_working_ptr, lda, tau_working_ptr, workspace_ptr, lwork, + info_d)); + // Do we need synchronized here? + // check the error info + int info_h; + memory::Copy(platform::CPUPlace(), &info_h, + BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()), + info_d, sizeof(int), dev_ctx.stream()); + PADDLE_ENFORCE_EQ( + info_h, 0, + platform::errors::PreconditionNotMet( + "For batch [%d]: CUSolver geqrf is not zero. [%d]", i, info_h)); + } +} + +template <> +void QrGPUKernel::BatchedOrgqr( + const platform::CUDADeviceContext& dev_ctx, int batch_size, int m, int n, + int k, float* a, int lda, float* tau, int a_stride, int tau_stride) const { + int lwork = 0; + + auto handle = dev_ctx.cusolver_dn_handle(); + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cusolverDnSorgqr_bufferSize( + handle, m, n, k, a, lda, tau, &lwork)); + auto workspace = memory::Alloc(dev_ctx, lwork * sizeof(float)); + float* workspace_ptr = reinterpret_cast(workspace->ptr()); + auto info = memory::Alloc(dev_ctx, sizeof(int)); + int* info_d = reinterpret_cast(info->ptr()); + + for (int i = 0; i < batch_size; ++i) { + float* a_working_ptr = &a[i * a_stride]; + float* tau_working_ptr = &tau[i * tau_stride]; + // compute orggr + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cusolverDnSorgqr( + handle, m, n, k, a_working_ptr, lda, tau_working_ptr, workspace_ptr, + lwork, info_d)); + // Do we need synchronized here? + // check the error info + int info_h; + memory::Copy(platform::CPUPlace(), &info_h, + BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()), + info_d, sizeof(int), dev_ctx.stream()); + PADDLE_ENFORCE_EQ( + info_h, 0, + platform::errors::PreconditionNotMet( + "For batch [%d]: CUSolver QR is not zero. 
[%d]", i, info_h)); + } +} + +template <> +void QrGPUKernel::BatchedOrgqr( + const platform::CUDADeviceContext& dev_ctx, int batch_size, int m, int n, + int k, double* a, int lda, double* tau, int a_stride, + int tau_stride) const { + int lwork = 0; + + auto handle = dev_ctx.cusolver_dn_handle(); + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cusolverDnDorgqr_bufferSize( + handle, m, n, k, a, lda, tau, &lwork)); + auto workspace = memory::Alloc(dev_ctx, lwork * sizeof(double)); + double* workspace_ptr = reinterpret_cast(workspace->ptr()); + auto info = memory::Alloc(dev_ctx, sizeof(int)); + int* info_d = reinterpret_cast(info->ptr()); + + for (int i = 0; i < batch_size; ++i) { + double* a_working_ptr = &a[i * a_stride]; + double* tau_working_ptr = &tau[i * tau_stride]; + // compute orggr + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cusolverDnDorgqr( + handle, m, n, k, a_working_ptr, lda, tau_working_ptr, workspace_ptr, + lwork, info_d)); + // Do we need synchronized here? + // check the error info + int info_h; + memory::Copy(platform::CPUPlace(), &info_h, + BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()), + info_d, sizeof(int), dev_ctx.stream()); + PADDLE_ENFORCE_EQ( + info_h, 0, + platform::errors::PreconditionNotMet( + "For batch [%d]: CUSolver QR is not zero. [%d]", i, info_h)); + } +} + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL(qr, ops::QrGPUKernel, ops::QrGPUKernel); +REGISTER_OP_CUDA_KERNEL( + qr_grad, ops::QrGradKernel, + ops::QrGradKernel); + +#endif // not PADDLE_WITH_HIP diff --git a/paddle/fluid/operators/qr_op.h b/paddle/fluid/operators/qr_op.h new file mode 100644 index 0000000000000..73ba52f590c0d --- /dev/null +++ b/paddle/fluid/operators/qr_op.h @@ -0,0 +1,135 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
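+
+// This header declares the mode-parsing helper shared by the CPU and CUDA QR
+// kernels, the CPU kernel implemented with Eigen::HouseholderQR, and a
+// gradient kernel placeholder (the QR backward pass is not implemented yet).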
+ +#pragma once + +#include +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/operators/math/complex_functors.h" +#include "paddle/fluid/platform/for_range.h" + +namespace paddle { +namespace operators { +using Tensor = framework::Tensor; +using DDim = framework::DDim; + +static inline std::tuple _parse_qr_mode(std::string mode) { + bool compute_q; + bool reduced; + if (mode == "reduced") { + compute_q = true; + reduced = true; + } else if (mode == "complete") { + compute_q = true; + reduced = false; + } else if (mode == "r") { + compute_q = false; + reduced = true; + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "QR received unrecognized mode '%s'" + " but expected one of 'reduced' (default), 'r', or 'complete'", + mode)); + } + return std::make_tuple(compute_q, reduced); +} + +template +class QrCPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + bool compute_q; + bool reduced_mode; + const Tensor& x = *context.Input("X"); + Tensor& q = *context.Output("Q"); + Tensor& r = *context.Output("R"); + std::string mode = context.Attr("mode"); + std::tie(compute_q, reduced_mode) = _parse_qr_mode(mode); + + auto numel = x.numel(); + PADDLE_ENFORCE_GT(numel, 0, platform::errors::PreconditionNotMet( + "The input of QR is empty.")); + auto x_dims = x.dims(); + int x_rank = x_dims.size(); + int m = x_dims[x_rank - 2]; + int n = x_dims[x_rank - 1]; + int min_mn = std::min(m, n); + int k = reduced_mode ? min_mn : m; + int batch_size = numel / (m * n); + int x_stride = m * n; + int q_stride = m * k; + int r_stride = k * n; + + auto* x_data = x.data>(); + T* q_data = nullptr; + if (compute_q) { + q_data = q.mutable_data>( + context.GetPlace(), + size_t(batch_size * m * k * sizeof(math::Real))); + } + auto* r_data = r.mutable_data>( + context.GetPlace(), size_t(batch_size * k * n * sizeof(math::Real))); + + // Implement QR by calling Eigen + for (int i = 0; i < batch_size; ++i) { + const T* x_matrix_ptr = x_data + i * x_stride; + T* r_matrix_ptr = r_data + i * r_stride; + using EigenDynamicMatrix = + Eigen::Matrix; + auto x_matrix = Eigen::Map(x_matrix_ptr, m, n); + Eigen::HouseholderQR qr(x_matrix); + if (reduced_mode) { + auto qr_top_matrix = qr.matrixQR().block(0, 0, min_mn, n); + auto r_matrix_view = + qr_top_matrix.template triangularView(); + auto r_matrix = EigenDynamicMatrix(r_matrix_view); + memcpy(r_matrix_ptr, r_matrix.data(), r_matrix.size() * sizeof(T)); + } else { + auto r_matrix_view = + qr.matrixQR().template triangularView(); + auto r_matrix = EigenDynamicMatrix(r_matrix_view); + memcpy(r_matrix_ptr, r_matrix.data(), r_matrix.size() * sizeof(T)); + } + + if (compute_q) { + T* q_matrix_ptr = q_data + i * q_stride; + if (reduced_mode) { + auto q_matrix = + qr.householderQ() * EigenDynamicMatrix::Identity(m, min_mn); + q_matrix.transposeInPlace(); + memcpy(q_matrix_ptr, q_matrix.data(), q_matrix.size() * sizeof(T)); + } else { + auto q_matrix = + qr.householderQ() * EigenDynamicMatrix::Identity(m, m); + q_matrix.transposeInPlace(); + memcpy(q_matrix_ptr, q_matrix.data(), q_matrix.size() * sizeof(T)); + } + } + } + } +}; + +template +class QrGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const { + PADDLE_THROW(platform::errors::InvalidArgument( + "QR doesn't have the backward kernel now and will be supported soon.")); + } +}; + +} // namespace operators +} // 
namespace paddle diff --git a/paddle/fluid/operators/svd_helper.h b/paddle/fluid/operators/svd_helper.h index 9ba7c9a3062a0..6b2584682277e 100644 --- a/paddle/fluid/operators/svd_helper.h +++ b/paddle/fluid/operators/svd_helper.h @@ -502,6 +502,19 @@ struct DeviceIndependenceTensorOperations { return ret; } + framework::Tensor TrilTriu(const framework::Tensor& x, int diagonal, + bool lower) { + framework::AttributeMap attrs; + attrs["diagonal"] = diagonal; + attrs["lower"] = lower; + NameInTensorMap inputs({{"X", {&x}}}); + int x_rank = x.dims().size(); + PADDLE_ENFORCE_GE(x_rank, 2, platform::errors::InvalidArgument( + "Rank must be at least 2.")); + std::vector out_shape = framework::vectorize(x.dims()); + return CreateOpRunAndReturnTensor("tril_triu", inputs, attrs, out_shape); + } + Tensor Conj(const Tensor& x) { Tensor out; auto* out_data = out.mutable_data(x.dims(), context.GetPlace()); diff --git a/paddle/fluid/platform/dynload/cusolver.h b/paddle/fluid/platform/dynload/cusolver.h index a8ce1cc9d3a35..4c018908b5945 100644 --- a/paddle/fluid/platform/dynload/cusolver.h +++ b/paddle/fluid/platform/dynload/cusolver.h @@ -65,11 +65,27 @@ CUSOLVER_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUSOLVER_WRAP); __macro(cusolverDnSpotrfBatched); \ __macro(cusolverDnDpotrfBatched); \ __macro(cusolverDnSgesvdj_bufferSize); \ + __macro(cusolverDnSgeqrf_bufferSize); \ + __macro(cusolverDnDgeqrf_bufferSize); \ + __macro(cusolverDnCgeqrf_bufferSize); \ + __macro(cusolverDnZgeqrf_bufferSize); \ + __macro(cusolverDnSorgqr_bufferSize); \ + __macro(cusolverDnDorgqr_bufferSize); \ + __macro(cusolverDnCungqr_bufferSize); \ + __macro(cusolverDnZungqr_bufferSize); \ __macro(cusolverDnDestroyGesvdjInfo); \ __macro(cusolverDnCreateGesvdjInfo); \ __macro(cusolverDnDgesvdj_bufferSize); \ __macro(cusolverDnSgesvdj); \ - __macro(cusolverDnDgesvdj); + __macro(cusolverDnDgesvdj); \ + __macro(cusolverDnSgeqrf); \ + __macro(cusolverDnDgeqrf); \ + __macro(cusolverDnCgeqrf); \ + __macro(cusolverDnZgeqrf); \ + __macro(cusolverDnSorgqr); \ + __macro(cusolverDnDorgqr); \ + __macro(cusolverDnCungqr); \ + __macro(cusolverDnZungqr); CUSOLVER_ROUTINE_EACH_R1(DECLARE_DYNAMIC_LOAD_CUSOLVER_WRAP) #endif diff --git a/python/paddle/fluid/tests/unittests/test_qr_op.py b/python/paddle/fluid/tests/unittests/test_qr_op.py new file mode 100644 index 0000000000000..ea2aaf3f00d5b --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_qr_op.py @@ -0,0 +1,173 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
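For reference, the check that this new unit test performs on every matrix in the batch can be sketched for a single input; this is a minimal illustration assuming the `paddle.linalg.qr` API added by this patch, mirroring the NumPy comparison used in the test:

    import numpy as np
    import paddle

    # one (3, 5) matrix; the test repeats the same comparison per batch entry
    a = np.random.rand(3, 5).astype(np.float32)
    np_q, np_r = np.linalg.qr(a, mode="reduced")

    q, r = paddle.linalg.qr(paddle.to_tensor(a), mode="reduced")
    assert np.allclose(q.numpy(), np_q, atol=1e-5)
    assert np.allclose(r.numpy(), np_r, atol=1e-5)

The shapes follow the "reduced" convention documented for qr: Q is `[..., M, K]` and R is `[..., K, N]` with `K = min(M, N)`.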
+ +from __future__ import print_function + +import unittest +import itertools +import numpy as np +import paddle +import paddle.fluid as fluid +import paddle.fluid.layers as layers +import paddle.fluid.core as core + + +class TestQrAPI(unittest.TestCase): + def test_dygraph(self): + paddle.disable_static() + + def run_qr_dygraph(shape, mode, dtype): + if dtype == "float32": + np_dtype = np.float32 + elif dtype == "float64": + np_dtype = np.float64 + a = np.random.rand(*shape).astype(np_dtype) + m = a.shape[-2] + n = a.shape[-1] + min_mn = min(m, n) + if mode == "reduced" or mode == "r": + k = min_mn + else: + k = m + np_q_shape = list(a.shape[:-2]) + np_q_shape.extend([m, k]) + np_r_shape = list(a.shape[:-2]) + np_r_shape.extend([k, n]) + np_q = np.zeros(np_q_shape).astype(np_dtype) + np_r = np.zeros(np_r_shape).astype(np_dtype) + places = [] + places = [fluid.CPUPlace()] + if core.is_compiled_with_cuda(): + places.append(fluid.CUDAPlace(0)) + for place in places: + batch_size = a.size // (a.shape[-1] * a.shape[-2]) + for i in range(batch_size): + coord = np.unravel_index(i, a.shape[:-2]) + if mode == "r": + tmp_r = np.linalg.qr(a[coord], mode=mode) + np_r[coord] = tmp_r + else: + tmp_q, tmp_r = np.linalg.qr(a[coord], mode=mode) + np_q[coord] = tmp_q + np_r[coord] = tmp_r + + x = paddle.to_tensor(a, dtype=dtype) + if mode == "r": + r = paddle.linalg.qr(x, mode=mode) + self.assertTrue(np.allclose(r, np_r, atol=1e-5)) + else: + q, r = paddle.linalg.qr(x, mode=mode) + self.assertTrue(np.allclose(q, np_q, atol=1e-5)) + self.assertTrue(np.allclose(r, np_r, atol=1e-5)) + + tensor_shapes = [ + (3, 5), + (5, 5), + (5, 3), # 2-dim Tensors + (2, 3, 5), + (3, 5, 5), + (4, 5, 3), # 3-dim Tensors + (2, 5, 3, 5), + (3, 5, 5, 5), + (4, 5, 5, 3) # 4-dim Tensors + ] + modes = ["reduced", "complete", "r"] + dtypes = ["float32", "float64"] + for tensor_shape, mode, dtype in itertools.product(tensor_shapes, modes, + dtypes): + run_qr_dygraph(tensor_shape, mode, dtype) + + def test_static(self): + paddle.enable_static() + + def run_qr_static(shape, mode, dtype): + if dtype == "float32": + np_dtype = np.float32 + elif dtype == "float64": + np_dtype = np.float64 + a = np.random.rand(*shape).astype(np_dtype) + m = a.shape[-2] + n = a.shape[-1] + min_mn = min(m, n) + if mode == "reduced" or mode == "r": + k = min_mn + else: + k = m + np_q_shape = list(a.shape[:-2]) + np_q_shape.extend([m, k]) + np_r_shape = list(a.shape[:-2]) + np_r_shape.extend([k, n]) + np_q = np.zeros(np_q_shape).astype(np_dtype) + np_r = np.zeros(np_r_shape).astype(np_dtype) + places = [] + places = [fluid.CPUPlace()] + if core.is_compiled_with_cuda(): + places.append(fluid.CUDAPlace(0)) + for place in places: + with fluid.program_guard(fluid.Program(), fluid.Program()): + batch_size = a.size // (a.shape[-1] * a.shape[-2]) + for i in range(batch_size): + coord = np.unravel_index(i, a.shape[:-2]) + if mode == "r": + tmp_r = np.linalg.qr(a[coord], mode=mode) + np_r[coord] = tmp_r + else: + tmp_q, tmp_r = np.linalg.qr(a[coord], mode=mode) + np_q[coord] = tmp_q + np_r[coord] = tmp_r + x = paddle.fluid.data( + name="input", shape=shape, dtype=dtype) + if mode == "r": + r = paddle.linalg.qr(x, mode=mode) + exe = fluid.Executor(place) + fetches = exe.run(fluid.default_main_program(), + feed={"input": a}, + fetch_list=[r]) + self.assertTrue( + np.allclose( + fetches[0], np_r, atol=1e-5)) + else: + q, r = paddle.linalg.qr(x, mode=mode) + exe = fluid.Executor(place) + fetches = exe.run(fluid.default_main_program(), + feed={"input": a}, + fetch_list=[q, 
r]) + self.assertTrue( + np.allclose( + fetches[0], np_q, atol=1e-5)) + self.assertTrue( + np.allclose( + fetches[1], np_r, atol=1e-5)) + + tensor_shapes = [ + (3, 5), + (5, 5), + (5, 3), # 2-dim Tensors + (2, 3, 5), + (3, 5, 5), + (4, 5, 3), # 3-dim Tensors + (2, 5, 3, 5), + (3, 5, 5, 5), + (4, 5, 5, 3) # 4-dim Tensors + ] + modes = ["reduced", "complete", "r"] + dtypes = ["float32", "float64"] + for tensor_shape, mode, dtype in itertools.product(tensor_shapes, modes, + dtypes): + run_qr_static(tensor_shape, mode, dtype) + + +if __name__ == "__main__": + paddle.enable_static() + unittest.main() diff --git a/python/paddle/linalg.py b/python/paddle/linalg.py index 726355379e7b6..06b512150cee8 100644 --- a/python/paddle/linalg.py +++ b/python/paddle/linalg.py @@ -23,6 +23,7 @@ from .tensor.linalg import multi_dot # noqa: F401 from .tensor.linalg import matrix_rank from .tensor.linalg import svd +from .tensor.linalg import qr from .tensor.linalg import eigh # noqa: F401 from .tensor.linalg import det from .tensor.linalg import slogdet @@ -38,6 +39,7 @@ 'multi_dot', 'matrix_rank', 'svd', + 'qr', 'matrix_power', 'det', 'slogdet', diff --git a/python/paddle/tensor/__init__.py b/python/paddle/tensor/__init__.py index c8f897c21648f..b898b60fe4712 100755 --- a/python/paddle/tensor/__init__.py +++ b/python/paddle/tensor/__init__.py @@ -47,6 +47,7 @@ from .linalg import mv # noqa: F401 from .linalg import eig # noqa: F401 from .linalg import matrix_power # noqa: F401 +from .linalg import qr # noqa: F401 from .linalg import eigvals # noqa: F401 from .linalg import multi_dot # noqa: F401 from .linalg import svd # noqa: F401 @@ -237,6 +238,7 @@ 'histogram', 'mv', 'matrix_power', + 'qr', 'eigvals', 'abs', 'acos', diff --git a/python/paddle/tensor/linalg.py b/python/paddle/tensor/linalg.py index f112603fbb60f..6853d904adbf6 100644 --- a/python/paddle/tensor/linalg.py +++ b/python/paddle/tensor/linalg.py @@ -1594,6 +1594,70 @@ def matrix_power(x, n, name=None): return out +def qr(x, mode="reduced", name=None): + r""" + Computes the QR decomposition of one matrix or batches of matrice (backward is unsupported now). + + Args: + x (Tensor): The input tensor. Its shape should be `[..., M, N]`, + where ... is zero or more batch dimensions. M and N can be arbitrary + positive number. The data type of x should be float32 or float64. + mode (str, optional): A flag to control the behavior of qr, the default is "reduced". + Suppose x's shape is `[..., M, N]` and denoting `K = min(M, N)`: + If mode = "reduced", qr op will return reduced Q and R matrices, + which means Q's shape is `[..., M, K]` and R's shape is `[..., K, N]`. + If mode = "complete", qr op will return complete Q and R matrices, + which means Q's shape is `[..., M, M]` and R's shape is `[..., M, N]`. + If mode = "r", qr op will only return reduced R matrix, which means + R's shape is `[..., K, N]`. + name (str, optional): Name for the operation (optional, default is None). + For more information, please refer to :ref:`api_guide_Name`. + + Returns: + If mode = "reduced" or mode = "complete", qr will return a two tensor-tuple, which represents Q and R. + If mode = "r", qr will return a tensor which represents R. + + Examples: + .. code-block:: python + + import paddle + + x = paddle.to_tensor([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]]).astype('float64') + q, r = paddle.linalg.qr(x) + print (q) + print (r) + + # Q = [[-0.16903085, 0.89708523], + # [-0.50709255, 0.27602622], + # [-0.84515425, -0.34503278]]) + + # R = [[-5.91607978, -7.43735744], + # [ 0. 
, 0.82807867]]) + + # one can verify : X = Q * R ; + """ + if in_dygraph_mode(): + q, r = _C_ops.qr(x, 'mode', mode) + if mode == "r": + return r + else: + return q, r + check_variable_and_dtype(x, 'dtype', ['float32', 'float64'], 'qr') + check_type(mode, 'mode', str, 'qr') + helper = LayerHelper('qr', **locals()) + q = helper.create_variable_for_type_inference(dtype=x.dtype) + r = helper.create_variable_for_type_inference(dtype=x.dtype) + attrs = dict() + attrs['mode'] = mode + helper.append_op( + type='qr', inputs={'X': [x]}, outputs={'Q': q, + 'R': r}, attrs=attrs) + if mode == "r": + return r + else: + return q, r + + def eig(x, name=None): """ This API performs the eigenvalue decomposition of a square matrix or a batch of square matrices. @@ -1674,7 +1738,7 @@ def eigvals(x, name=None): Its data type should be float32, float64, complex64, or complex128. name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. - + Returns: Tensor: A tensor containing the unsorted eigenvalues which has the same batch dimensions with `x`. The eigenvalues are complex-valued even when `x` is real. From 7edcc4fbbe3f90aecba0cc0197c1f89d2368a17b Mon Sep 17 00:00:00 2001 From: xiongkun Date: Tue, 19 Oct 2021 14:45:01 +0800 Subject: [PATCH 024/116] catch the generatorfunction and intercept it. (#35369) * catch the generatorfunction and intercept it. * add test generator * add test case * refine the testcase --- .../dygraph_to_static/convert_call_func.py | 11 +++++ .../test_convert_call_generator.py | 49 +++++++++++++++++++ 2 files changed, 60 insertions(+) create mode 100644 python/paddle/fluid/tests/unittests/dygraph_to_static/test_convert_call_generator.py diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/convert_call_func.py b/python/paddle/fluid/dygraph/dygraph_to_static/convert_call_func.py index b62c16989fbe7..300586969ff65 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/convert_call_func.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/convert_call_func.py @@ -167,6 +167,17 @@ def dyfunc(x): if is_builtin(func) or is_unsupported(func): return func + if inspect.isgeneratorfunction(func): + # NOTE(xiongkun03): inspect.isfunction() will return True even though func is a generator function. + # If we don't deal generatorfunction here, we will regard it as normal function and get errors in some + # occasion. + number_of_stars = 30 + translator_logger.warn( + "\n\n" + "*" * number_of_stars + + "\nYour function:`{}` doesn't support to transform to static function because it is a generator function, it will be run as-is." + .format(func.__name__) + "\n" + "*" * number_of_stars + "\n\n") + return func + if inspect.isfunction(func): # TODO(liym27): If func is a lambda function, special conversion is needed. if func.__name__ == '': diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_convert_call_generator.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_convert_call_generator.py new file mode 100644 index 0000000000000..cfe9e191ed486 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_convert_call_generator.py @@ -0,0 +1,49 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest + +import logging +import numpy as np + +import paddle +import paddle.fluid as fluid +from paddle.fluid.dygraph import ProgramTranslator +from paddle.fluid.dygraph.dygraph_to_static.convert_call_func import CONVERSION_OPTIONS +from test_program_translator import get_source_code +from paddle.jit import to_static + + +def dyfunc_generator(): + for i in range(100): + yield paddle.to_tensor([i] * 10) + + +def main_func(): + """ Error will raise, but we only report a warning not intercept + """ + for i in dyfunc_generator(): + print(i) + + +class TestConvertGenerator(unittest.TestCase): + def test_raise_error(self): + with self.assertRaises(Exception): + to_static(main_func)() + + +if __name__ == '__main__': + unittest.main() From d89a759bba8dacd2da2a27e8142e4b37bbfd3954 Mon Sep 17 00:00:00 2001 From: littletomatodonkey Date: Tue, 19 Oct 2021 14:57:23 +0800 Subject: [PATCH 025/116] fix replicate pad when input size is 0 (#36510) * fix replicate pad when input size is 0 * add unit test --- paddle/fluid/operators/pad3d_op.cc | 12 +++++------- paddle/fluid/operators/pad3d_op.cu | 12 +++++------- python/paddle/fluid/tests/unittests/test_pad3d_op.py | 10 ++++++++++ 3 files changed, 20 insertions(+), 14 deletions(-) diff --git a/paddle/fluid/operators/pad3d_op.cc b/paddle/fluid/operators/pad3d_op.cc index c2be9ac97ff89..e84b5a9d9baae 100644 --- a/paddle/fluid/operators/pad3d_op.cc +++ b/paddle/fluid/operators/pad3d_op.cc @@ -565,13 +565,11 @@ class Pad3dCPUKernel : public framework::OpKernel { " in reflect mode" ", but received depth(%d) and pad_right(%d).", in_width, pads[1])); - } - - if (mode == "circular") { - PADDLE_ENFORCE_NE( - in_depth * in_height * in_width, 0, - platform::errors::InvalidArgument( - "The input tensor size can not be 0 for circular padding mode.")); + } else if (mode == "circular" || mode == "replicate") { + PADDLE_ENFORCE_NE(in_depth * in_height * in_width, 0, + platform::errors::InvalidArgument( + "The input tensor size can not be 0 for circular " + "or replicate padding mode.")); } const int pad_left = pads[0]; diff --git a/paddle/fluid/operators/pad3d_op.cu b/paddle/fluid/operators/pad3d_op.cu index ed936c10755f0..f243a78e5578b 100644 --- a/paddle/fluid/operators/pad3d_op.cu +++ b/paddle/fluid/operators/pad3d_op.cu @@ -618,13 +618,11 @@ class Pad3dCUDAKernel : public framework::OpKernel { " in reflect mode" ", but received depth(%d) and pad_right(%d).", in_width, pads[1])); - } - - if (mode == "circular") { - PADDLE_ENFORCE_NE( - in_depth * in_height * in_width, 0, - platform::errors::InvalidArgument( - "The input tensor size can not be 0 for circular padding mode.")); + } else if (mode == "circular" || mode == "replicate") { + PADDLE_ENFORCE_NE(in_depth * in_height * in_width, 0, + platform::errors::InvalidArgument( + "The input tensor size can not be 0 for circular " + "or replicate padding mode.")); } const int pad_left = pads[0]; diff --git a/python/paddle/fluid/tests/unittests/test_pad3d_op.py b/python/paddle/fluid/tests/unittests/test_pad3d_op.py index 5ec7bdc66fe49..7abc314bc1ba0 100644 --- 
a/python/paddle/fluid/tests/unittests/test_pad3d_op.py +++ b/python/paddle/fluid/tests/unittests/test_pad3d_op.py @@ -732,6 +732,15 @@ def test_circular_1(): mode='circular', data_format="NCDHW") + def test_replicate_1(): + input_shape = (1, 2, 0, 4, 5) + data = np.random.rand(*input_shape).astype(np.float32) + x = paddle.to_tensor(data) + y = F.pad(x, + pad=[1, 1, 1, 1, 2, 3], + mode='replicate', + data_format="NCDHW") + paddle.disable_static() for place in self.places: self.assertRaises(ValueError, test_variable) @@ -739,6 +748,7 @@ def test_circular_1(): self.assertRaises(Exception, test_reflect_2) self.assertRaises(Exception, test_reflect_3) self.assertRaises(Exception, test_circular_1) + self.assertRaises(Exception, test_replicate_1) paddle.enable_static() From 8cc8e411121649be36af8396536502e7ef7539b7 Mon Sep 17 00:00:00 2001 From: WangXi Date: Tue, 19 Oct 2021 14:59:38 +0800 Subject: [PATCH 026/116] [hybrid] static model parallel dropout support deterministic RandomSeedGenerator (#36228) --- paddle/fluid/framework/generator.cc | 37 +++++ paddle/fluid/framework/generator.h | 6 + paddle/fluid/operators/dropout_impl_util.h | 10 +- paddle/fluid/operators/seed_op.cc | 11 ++ paddle/fluid/operators/seed_op.cu | 11 +- paddle/fluid/operators/seed_op.h | 34 +++-- paddle/fluid/pybind/generator_py.cc | 2 + .../meta_parallel/parallel_layers/random.py | 137 ++++++++++++++++++ python/paddle/fluid/backward.py | 6 +- .../fluid/tests/unittests/test_dropout_op.py | 44 ++++++ .../fluid/tests/unittests/test_optimizer.py | 48 +++++- .../fluid/tests/unittests/test_seed_op.py | 32 +++- python/paddle/framework/random.py | 8 + 13 files changed, 354 insertions(+), 32 deletions(-) diff --git a/paddle/fluid/framework/generator.cc b/paddle/fluid/framework/generator.cc index 4b64722a7abf5..154154fc79517 100644 --- a/paddle/fluid/framework/generator.cc +++ b/paddle/fluid/framework/generator.cc @@ -63,6 +63,43 @@ const std::shared_ptr& DefaultCPUGenerator() { return default_cpu_generator; } +using RNGMap = std::unordered_map>; + +static RNGMap& GetRandomSeedGeneratorMap() { + static auto random_seed_generator_map = RNGMap(); + return random_seed_generator_map; +} + +const std::shared_ptr& SetRandomSeedGenerator( + const std::string& name, uint64_t seed) { + auto& rng_map = GetRandomSeedGeneratorMap(); + auto iter = rng_map.find(name); + PADDLE_ENFORCE_EQ(iter == rng_map.end(), true, + platform::errors::AlreadyExists( + "%s RandomSeedGenerator is already exist", name)); + + auto generator = std::make_shared(seed); + bool emplace_success = rng_map.emplace(name, generator).second; + PADDLE_ENFORCE_EQ( + emplace_success, true, + platform::errors::PermissionDenied( + "SetRandomSeedGenerator cannot emplace %s RandomSeedGenerator", + name)); + return rng_map[name]; +} + +const std::shared_ptr& GetRandomSeedGenerator( + const std::string& name) { + auto& rng_map = GetRandomSeedGeneratorMap(); + auto iter = rng_map.find(name); + PADDLE_ENFORCE_EQ(iter != rng_map.end(), true, + platform::errors::NotFound( + "%s RandomSeedGenerator is not found, please " + "use `set_random_seed_generator` to set rng first", + name)); + return iter->second; +} + std::shared_ptr OpDefaultCPUEngine() { static auto op_default_cpu_engine = std::make_shared(); return op_default_cpu_engine; diff --git a/paddle/fluid/framework/generator.h b/paddle/fluid/framework/generator.h index 862e63c4c6af5..d0a5b4443e3f4 100644 --- a/paddle/fluid/framework/generator.h +++ b/paddle/fluid/framework/generator.h @@ -126,5 +126,11 @@ std::shared_ptr 
GetCPURandomEngine(uint64_t); const std::shared_ptr& GetDefaultCUDAGenerator( int64_t device_id = -1); +const std::shared_ptr& SetRandomSeedGenerator( + const std::string& name, uint64_t seed); + +const std::shared_ptr& GetRandomSeedGenerator( + const std::string& name); + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/operators/dropout_impl_util.h b/paddle/fluid/operators/dropout_impl_util.h index a7188efe7139c..f2038d12528c4 100644 --- a/paddle/fluid/operators/dropout_impl_util.h +++ b/paddle/fluid/operators/dropout_impl_util.h @@ -29,7 +29,7 @@ inline void GetSeedDataAndIncrement(const platform::CUDADeviceContext& dev_ctx, BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()).GetDeviceId(); auto gen_cuda = framework::GetDefaultCUDAGenerator(device_id); - if ((seed) && platform::is_gpu_place(seed->place())) { + if (seed) { framework::Tensor seed_cpu_tensor; TensorCopySync(*seed, platform::CPUPlace(), &seed_cpu_tensor); *seed_data = static_cast(seed_cpu_tensor.data()[0]); @@ -39,12 +39,8 @@ inline void GetSeedDataAndIncrement(const platform::CUDADeviceContext& dev_ctx, *seed_data = seed_offset.first; *increment = seed_offset.second; } else { - if (seed) { - *seed_data = *(seed->data()); - } else { - std::random_device rnd; - *seed_data = is_fix_seed ? seed_val : rnd(); - } + std::random_device rnd; + *seed_data = is_fix_seed ? seed_val : rnd(); *increment = offset; } } diff --git a/paddle/fluid/operators/seed_op.cc b/paddle/fluid/operators/seed_op.cc index 32daa8c3934ae..837ccae0284f5 100644 --- a/paddle/fluid/operators/seed_op.cc +++ b/paddle/fluid/operators/seed_op.cc @@ -39,6 +39,17 @@ class SeedOpMaker : public framework::OpProtoAndCheckerMaker { void Make() override { AddOutput("Out", "The output of seed op."); AddAttr("seed", "Dropout random seed.").SetDefault(0); + AddAttr("deterministic", + "(bool, default false) Whether to use deterministic " + "RandomSeedGenerator which " + "generate by `set_random_seed_generator`") + .SetDefault(false) + .AsExtra(); + AddAttr( + "rng_name", + "use deterministic RandomSeedGenerator which name is `rng_name`") + .SetDefault("") + .AsExtra(); AddAttr("force_cpu", "(bool, default false) Force fill output variable to cpu " "memory. Otherwise, fill output variable to the running " diff --git a/paddle/fluid/operators/seed_op.cu b/paddle/fluid/operators/seed_op.cu index 4593b88019621..4ca75bcf76e51 100644 --- a/paddle/fluid/operators/seed_op.cu +++ b/paddle/fluid/operators/seed_op.cu @@ -23,16 +23,9 @@ class GPUSeedKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { auto *out = context.Output("Out"); - int user_seed = context.Attr("seed"); - auto force_cpu = context.Attr("force_cpu"); - std::random_device rnd; - int seed; - if (user_seed != 0) { - seed = user_seed; - } else { - seed = rnd(); - } + int seed = get_seed(context); + auto force_cpu = context.Attr("force_cpu"); bool cpu_place = force_cpu || context.GetPlace() == platform::CPUPlace(); if (cpu_place) { platform::DeviceContextPool &pool = diff --git a/paddle/fluid/operators/seed_op.h b/paddle/fluid/operators/seed_op.h index 671f397d4eaff..202f25e0b4cd1 100644 --- a/paddle/fluid/operators/seed_op.h +++ b/paddle/fluid/operators/seed_op.h @@ -13,6 +13,7 @@ // limitations under the License. 
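The deterministic branch added here is driven from Python through named seed generators; a minimal sketch of the intended usage, assuming the `set_random_seed_generator` API and the hybrid-parallel `dropout` wrapper added elsewhere in this patch:

    import numpy as np
    import paddle
    import paddle.static as static
    from paddle.distributed.fleet.meta_parallel.parallel_layers.random import dropout

    paddle.enable_static()
    # two generators initialised with the same seed must drive identical dropout masks
    paddle.framework.random.set_random_seed_generator('seed0', 123)
    paddle.framework.random.set_random_seed_generator('seed1', 123)

    with static.program_guard(static.Program(), static.Program()):
        x = static.data(name='x', shape=[40, 40], dtype='float32')
        out0 = dropout(x, p=0.3, rng_name='seed0')
        out1 = dropout(x, p=0.3, rng_name='seed1')

        exe = static.Executor(paddle.CPUPlace())
        res0, res1 = exe.run(static.default_main_program(),
                             feed={'x': np.random.random([40, 40]).astype('float32')},
                             fetch_list=[out0, out1])
        assert np.allclose(res0, res1)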
#pragma once +#include "paddle/fluid/framework/generator.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" @@ -20,24 +21,37 @@ namespace paddle { namespace operators { using Tensor = framework::Tensor; -template -class CPUSeedKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* out = context.Output("Out"); - auto* out_data = out->mutable_data(context.GetPlace()); - int user_seed = context.Attr("seed"); +static int get_seed(const framework::ExecutionContext& context) { + int user_seed = context.Attr("seed"); + bool deterministic = context.Attr("deterministic"); + int seed = 0; + if (!deterministic) { // NOTE: fixed seed should only be used in unittest or for debug. // Guarantee to use random seed in training. - std::random_device rnd; - int seed; if (user_seed != 0) { seed = user_seed; } else { + std::random_device rnd; seed = rnd(); } - out_data[0] = seed; + } else { + std::string name = context.Attr("rng_name"); + auto rng = framework::GetRandomSeedGenerator(name); + do { // NOTE(wangxi): cpu dropout will use random seed if seed == 0 + seed = static_cast(rng->Random64()); + } while (seed == 0); + } + return seed; +} + +template +class CPUSeedKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* out = context.Output("Out"); + auto* out_data = out->mutable_data(context.GetPlace()); + out_data[0] = get_seed(context); } }; diff --git a/paddle/fluid/pybind/generator_py.cc b/paddle/fluid/pybind/generator_py.cc index 67121e24089f7..fa924ce658125 100644 --- a/paddle/fluid/pybind/generator_py.cc +++ b/paddle/fluid/pybind/generator_py.cc @@ -60,6 +60,8 @@ void BindGenerator(py::module* m_ptr) { &framework::Generator::SetIsInitPy); m.def("default_cpu_generator", &framework::DefaultCPUGenerator); m.def("default_cuda_generator", &framework::GetDefaultCUDAGenerator); + m.def("set_random_seed_generator", &framework::SetRandomSeedGenerator); + m.def("get_random_seed_generator", &framework::GetRandomSeedGenerator); } } // namespace pybind } // namespace paddle diff --git a/python/paddle/distributed/fleet/meta_parallel/parallel_layers/random.py b/python/paddle/distributed/fleet/meta_parallel/parallel_layers/random.py index ec80ba71036c0..0a96745c2a4a1 100644 --- a/python/paddle/distributed/fleet/meta_parallel/parallel_layers/random.py +++ b/python/paddle/distributed/fleet/meta_parallel/parallel_layers/random.py @@ -15,6 +15,11 @@ import paddle import contextlib import numpy as np +from paddle import _C_ops +from paddle.fluid import core +from paddle.fluid.data_feeder import check_variable_and_dtype +from paddle.fluid.framework import in_dygraph_mode, default_main_program +from paddle.fluid.layer_helper import LayerHelper __all__ = [] @@ -93,3 +98,135 @@ def model_parallel_random_seed(seed=None): RNG_STATE_TRACKER.reset() RNG_STATE_TRACKER.add(MODEL_PARALLEL_RNG, local_seed) paddle.seed(global_seed) + + +def determinate_seed(rng_name): + assert rng_name is not None and rng_name != "" + helper = LayerHelper('seed', **locals()) + out = helper.create_variable_for_type_inference(dtype=paddle.int32) + # set force_cpu to reduce sync copy from CPU->GPU->CPU, and reduce pipeline hang + helper.append_op( + type='seed', + outputs={'Out': out}, + attrs={'deterministic': True, + 'rng_name': rng_name, + 'force_cpu': True}) + return out + + +def dropout(x, + p=0.5, + axis=None, + rng_name=None, + training=True, 
+ mode="upscale_in_train", + name=None): + """ + Dropout is a regularization technique for reducing overfitting by preventing + neuron co-adaption during training. The dropout operator randomly sets the + outputs of some units to zero, while upscale others according to the given + dropout probability. + + Args: + x (Tensor): The input tensor. The data type is float32 or float64. + p (float|int): Probability of setting units to zero. Default 0.5. + axis (int|list|tuple): The axis along which the dropout is performed. Default None. + rng_name (str): The random seed generator name, which used to obtain deterministic results. + training (bool): A flag indicating whether it is in train phrase or not. Default True. + mode(str): ['upscale_in_train'(default) | 'downscale_in_infer']. + + 1. upscale_in_train(default), upscale the output at training time + + - train: out = input * mask / ( 1.0 - dropout_prob ) + - inference: out = input + + 2. downscale_in_infer, downscale the output at inference + + - train: out = input * mask + - inference: out = input * (1.0 - dropout_prob) + name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. + + Returns: + A Tensor representing the dropout, has same shape and data type as `x` . + + + Examples: + We use ``p=0.5`` in the following description for simplicity. + + 1. When ``axis=None`` , this is commonly used dropout, which dropout each element of x randomly. + + .. code-block:: text + + Let's see a simple case when x is a 2d tensor with shape 2*3: + [[1 2 3] + [4 5 6]] + we generate mask with the same shape as x, which is 2*3. The value of mask is + sampled from a Bernoulli distribution randomly. For example, we may get such mask: + [[0 1 0] + [1 0 1]] + So the output is obtained from elementwise multiply of x and mask: + [[0 2 0] + [4 0 6]] + Using default setting, i.e. ``mode='upscale_in_train'`` , + if in training phase, the final upscale output is: + [[0 4 0 ] + [8 0 12]] + if in test phase, the output is the same as input: + [[1 2 3] + [4 5 6]] + we can also set ``mode='downscale_in_infer'`` , then + if in training phase, the final output is: + [[0 2 0] + [4 0 6]] + if in test phase, the scale output is: + [[0.5 1. 1.5] + [2. 2.5 3. 
]] + + """ + if rng_name is None: + return paddle.nn.functional.dropout(x, p, axis, training, mode, name) + + # fast return for p == 0 + if p == 0: return x + + assert isinstance(p, (float, int)), \ + TypeError("p argument should be a number") + assert 0 <= p <= 1, ValueError("p argument should between 0 and 1") + assert mode in ('downscale_in_infer', 'upscale_in_train'), \ + ValueError( + "mode argument should be 'downscale_in_infer' or 'upscale_in_train'") + + assert axis is None, \ + TypeError("unsupport axis when using random seed generator") + + mode = 'downgrade_in_infer' if mode == 'downscale_in_infer' else mode #semantic transfer + + # dygraph using tracker, doesn't need determinate seed + if in_dygraph_mode(): + out, mask = _C_ops.dropout(x, 'dropout_prob', p, 'is_test', + not training, 'fix_seed', False, 'seed', 0, + 'dropout_implementation', mode) + return out + + seed = determinate_seed(rng_name) + + helper = LayerHelper('dropout', **locals()) + check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], + 'dropout') + + out = helper.create_variable_for_type_inference(dtype=x.dtype) + mask = helper.create_variable_for_type_inference( + dtype=core.VarDesc.VarType.UINT8, stop_gradient=True) + + helper.append_op( + type='dropout', + inputs={'X': [x], + 'Seed': seed}, + outputs={'Out': [out], + 'Mask': [mask]}, + attrs={ + 'dropout_prob': p, + 'is_test': not training, + 'dropout_implementation': mode, + }) + return out diff --git a/python/paddle/fluid/backward.py b/python/paddle/fluid/backward.py index 7ab060be6df29..d62f7b5941126 100755 --- a/python/paddle/fluid/backward.py +++ b/python/paddle/fluid/backward.py @@ -175,11 +175,15 @@ def modify_forward_desc_for_recompute(self): return op_idx = 0 - while (op_idx < len(self.ops)): + while op_idx < len(self.ops): op = self.ops[op_idx] if op.desc.type() != "dropout": op_idx += 1 continue + # already insert seed op before dropout + if op.input('Seed') is not None and len(op.input('Seed')) == 1: + op_idx += 1 + continue # add a seed op so that the two dropout op can generate same output op_unique_name = unique_name.generate("seed") var_unique_name = unique_name.generate_with_ignorable_key(".".join( diff --git a/python/paddle/fluid/tests/unittests/test_dropout_op.py b/python/paddle/fluid/tests/unittests/test_dropout_op.py index 396d55b3d0a8b..bf10e07ba0d6f 100644 --- a/python/paddle/fluid/tests/unittests/test_dropout_op.py +++ b/python/paddle/fluid/tests/unittests/test_dropout_op.py @@ -19,6 +19,7 @@ import paddle.fluid.core as core from op_test import OpTest, skip_check_grad_ci import paddle +import paddle.static as static import paddle.fluid as fluid from paddle.fluid import Program, program_guard @@ -856,5 +857,48 @@ def test_dygraph(self): self.assertTrue(np.allclose(result.numpy(), result_np)) +class TestDropoutWithDeterminateSeedGenerator(unittest.TestCase): + def setUp(self): + paddle.framework.random.set_random_seed_generator('seed0', 123) + paddle.framework.random.set_random_seed_generator('seed1', 123) + rng0 = paddle.framework.random.get_random_seed_generator('seed0') + rng1 = paddle.framework.random.get_random_seed_generator('seed1') + self.places = [paddle.CPUPlace()] + if paddle.is_compiled_with_cuda(): + self.places.append(paddle.CUDAPlace(0)) + + def check_static_result(self, place): + from paddle.distributed.fleet.meta_parallel.parallel_layers.random import dropout + with static.program_guard(static.Program(), static.Program()): + input = static.data(name="input", shape=[40, 40], dtype="float32") + res1 = dropout( 
+ input, + p=0.3, + training=True, + mode='upscale_in_train', + rng_name='seed0') + res2 = dropout( + input, + p=0.3, + training=True, + mode='upscale_in_train', + rng_name='seed1') + res3 = dropout(input, p=0.3) + + in_np = np.random.random([40, 40]).astype("float32") + + exe = static.Executor(place) + res_list = [res1, res2] + for i in range(2): + out1, out2 = exe.run(static.default_main_program(), + feed={"input": in_np}, + fetch_list=res_list) + self.assertTrue(np.allclose(out1, out2)) + + def test_static(self): + for place in self.places: + self.check_static_result(place=place) + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_optimizer.py b/python/paddle/fluid/tests/unittests/test_optimizer.py index 31704ebcd9192..89c7be18a7dfa 100644 --- a/python/paddle/fluid/tests/unittests/test_optimizer.py +++ b/python/paddle/fluid/tests/unittests/test_optimizer.py @@ -619,7 +619,7 @@ def test_lookahead_optimizer(self): class TestRecomputeOptimizer(unittest.TestCase): - def net(self, return_input=False, with_dropout=False): + def net(self, return_input=False, with_dropout=False, with_seed=False): program = framework.Program() block = program.global_block() mul_x = block.create_parameter( @@ -628,7 +628,8 @@ def net(self, return_input=False, with_dropout=False): dtype="float32", shape=[10, 8], lod_level=0, name="mul.y") mul_out = block.create_var( dtype="float32", shape=[5, 8], lod_level=0, name="mul.out") - if with_dropout == True: + + if with_dropout is True: mul_out_drop = block.create_var( dtype="float32", shape=[5, 8], @@ -636,6 +637,10 @@ def net(self, return_input=False, with_dropout=False): name="mul.out.dropout") mul_out_mask = block.create_var( dtype="uint8", shape=[5, 8], lod_level=0, name="mul.out.mask") + if with_seed is True: + seed_out = block.create_var( + dtype="int32", shape=[1], name="seed.out") + b1 = block.create_parameter( dtype="float32", shape=[5, 8], lod_level=0, name="b1") b1_out = block.create_var( @@ -652,10 +657,23 @@ def net(self, return_input=False, with_dropout=False): "Y": mul_y}, outputs={"Out": mul_out}, attrs={"x_num_col_dims": 1}) - if with_dropout == True: + + if with_dropout is True: + dropout_inputs = {'X': [mul_out]} + if with_seed is True: + block.append_op( + type='seed', + outputs={'Out': seed_out}, + attrs={ + 'deterministic': True, + 'rng_name': 'rng0', + 'force_cpu': True + }) + dropout_inputs = {'X': [mul_out], 'Seed': [seed_out]} + block.append_op( type='dropout', - inputs={'X': [mul_out]}, + inputs=dropout_inputs, outputs={'Out': [mul_out_drop], 'Mask': [mul_out_mask]}, attrs={'dropout_prob': 0.5, }) @@ -670,6 +688,7 @@ def net(self, return_input=False, with_dropout=False): inputs={"X": mul_out, "Y": b1}, outputs={"Out": b1_out}) + block.append_op( type="elementwise_add", inputs={"X": b1_out, @@ -864,6 +883,27 @@ def test_dropout(self): "sgd", "sgd", "sgd" ]) + def test_dropout_with_determinate_seed(self): + mul_out, b1_out, b2_out, mean_out = self.net(with_dropout=True, + with_seed=True) + self.assertEqual(len(mean_out.block.ops), 6) + self.assertEqual([op.type for op in mean_out.block.ops], [ + "mul", "seed", "dropout", "elementwise_add", "elementwise_add", + "mean" + ]) + sgd_optimizer = optimizer.SGD(learning_rate=1.0) + recompute_optimizer = optimizer.RecomputeOptimizer(sgd_optimizer) + recompute_optimizer._set_checkpoints([b1_out]) + opts, params_grads = recompute_optimizer.minimize(mean_out) + + self.assertEqual(len(mean_out.block.ops), 17) + self.assertEqual([op.type for op in 
mean_out.block.ops], [ + "mul", "seed", "dropout", "elementwise_add", "elementwise_add", + "mean", "fill_constant", "mean_grad", "elementwise_add_grad", "mul", + "dropout", "elementwise_add_grad", "dropout_grad", "mul_grad", + "sgd", "sgd", "sgd" + ]) + def test_dropout_with_seed(self): """ when we recompute a dropout op, make sure that the recomputed one diff --git a/python/paddle/fluid/tests/unittests/test_seed_op.py b/python/paddle/fluid/tests/unittests/test_seed_op.py index 08478d7140d43..0dcc197ece7ed 100644 --- a/python/paddle/fluid/tests/unittests/test_seed_op.py +++ b/python/paddle/fluid/tests/unittests/test_seed_op.py @@ -17,7 +17,10 @@ import unittest import numpy as np from op_test import OpTest -import paddle.fluid as fluid +import paddle +import paddle.static as static + +paddle.enable_static() class TestSeedOpFixSeed(OpTest): @@ -42,5 +45,32 @@ def test_check_output(self): self.check_output(no_check_set=["Out"]) +class TestDropoutWithRandomSeedGenerator(unittest.TestCase): + def setUp(self): + paddle.framework.random.set_random_seed_generator('seed0', 123) + paddle.framework.random.set_random_seed_generator('seed1', 123) + self.rng0 = paddle.framework.random.get_random_seed_generator('seed0') + self.rng1 = paddle.framework.random.get_random_seed_generator('seed1') + self.places = [paddle.CPUPlace()] + if paddle.is_compiled_with_cuda(): + self.places.append(paddle.CUDAPlace(0)) + + def check_static_result(self, place): + import paddle.distributed.fleet.meta_parallel.parallel_layers.random as random + with static.program_guard(static.Program(), static.Program()): + res1 = random.determinate_seed('seed0') + + exe = static.Executor(place) + res_list = [res1] + for i in range(2): + out1, = exe.run(static.default_main_program(), + fetch_list=res_list) + self.assertEqual(out1, np.cast['int32'](self.rng1.random())) + + def test_static(self): + for place in self.places: + self.check_static_result(place=place) + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/framework/random.py b/python/paddle/framework/random.py index 701f8b5352c3d..a560072cf5a7b 100644 --- a/python/paddle/framework/random.py +++ b/python/paddle/framework/random.py @@ -122,3 +122,11 @@ def _manual_program_seed(seed): fluid.default_startup_program().random_seed = seed program = fluid.Program() program.global_seed(seed) + + +def set_random_seed_generator(name, seed): + core.set_random_seed_generator(name, seed) + + +def get_random_seed_generator(name): + return core.get_random_seed_generator(name) From 7b67f398c33e03930aea8cfb0d330c2c28757100 Mon Sep 17 00:00:00 2001 From: wangxinxin08 <69842442+wangxinxin08@users.noreply.github.com> Date: Tue, 19 Oct 2021 15:06:48 +0800 Subject: [PATCH 027/116] add nearest_interp_v2 trt plugin (#34126) * add nearest_interp_v2 trt plugin --- .../fluid/inference/api/analysis_predictor.cc | 1 + .../inference/tensorrt/convert/CMakeLists.txt | 1 + .../tensorrt/convert/nearest_interp_v2_op.cc | 108 +++++++++++++ .../convert/test_nearest_interp_v2_op.cc | 54 +++++++ paddle/fluid/inference/tensorrt/op_teller.cc | 30 +++- .../tests/infer_ut/test_det_mv3_db.cc | 41 +---- .../unittests/ir/inference/CMakeLists.txt | 1 + .../test_trt_convert_nearest_interp_v2.py | 101 ++++++++++++ .../test_trt_nearest_interp_v2_op.py | 151 ++++++++++++++++++ 9 files changed, 450 insertions(+), 38 deletions(-) create mode 100644 paddle/fluid/inference/tensorrt/convert/nearest_interp_v2_op.cc create mode 100644 paddle/fluid/inference/tensorrt/convert/test_nearest_interp_v2_op.cc create mode 
100644 python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_nearest_interp_v2.py create mode 100644 python/paddle/fluid/tests/unittests/ir/inference/test_trt_nearest_interp_v2_op.py diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 3136e53e74d09..dfa27037205f1 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -1403,6 +1403,7 @@ USE_TRT_CONVERTER(roi_align); USE_TRT_CONVERTER(affine_channel); USE_TRT_CONVERTER(multiclass_nms); USE_TRT_CONVERTER(nearest_interp); +USE_TRT_CONVERTER(nearest_interp_v2); USE_TRT_CONVERTER(reshape); USE_TRT_CONVERTER(reduce_sum); USE_TRT_CONVERTER(gather_nd); diff --git a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt index f2c7a4b62bbbb..ef12cb6b36617 100644 --- a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt @@ -18,6 +18,7 @@ nv_library(tensorrt_converter tile_op.cc conv3d_op.cc mish_op.cc + nearest_interp_v2_op.cc DEPS tensorrt_engine tensorrt_plugin operator scope framework_proto op_registry) nv_test(test_op_converter SRCS test_op_converter.cc DEPS diff --git a/paddle/fluid/inference/tensorrt/convert/nearest_interp_v2_op.cc b/paddle/fluid/inference/tensorrt/convert/nearest_interp_v2_op.cc new file mode 100644 index 0000000000000..f2e0e0c09c5ef --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/nearest_interp_v2_op.cc @@ -0,0 +1,108 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at +http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/framework/data_layout.h" +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" + +namespace paddle { +namespace framework { +class Scope; +namespace proto { +class OpDesc; +} // namespace proto +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace inference { +namespace tensorrt { + +class NearestInterpolateV2OpConverter : public OpConverter { + public: + void operator()(const framework::proto::OpDesc& op, + const framework::Scope& scope, bool test_mode) override { + VLOG(3) << "convert a fluid nearest_interp_v2 op"; + + framework::OpDesc op_desc(op, nullptr); + + std::string input_name = op_desc.Input("X").front(); + std::string output_name = op_desc.Output("Out").front(); + + auto input = engine_->GetITensor(input_name); + + auto data_layout = framework::StringToDataLayout( + BOOST_GET_CONST(std::string, op_desc.GetAttr("data_layout"))); + auto interp_method = + BOOST_GET_CONST(std::string, op_desc.GetAttr("interp_method")); + bool align_corners = + BOOST_GET_CONST(bool, op_desc.GetAttr("align_corners")); + + auto input_names = op_desc.Input("X"); + auto scale = BOOST_GET_CONST(std::vector, op_desc.GetAttr("scale")); + auto out_h = BOOST_GET_CONST(int, op_desc.GetAttr("out_h")); + auto out_w = BOOST_GET_CONST(int, op_desc.GetAttr("out_w")); + + auto layer = TRT_ENGINE_ADD_LAYER(engine_, Resize, *input); + layer->setAlignCorners(align_corners); + + auto in_dim = input->getDimensions(); + + float scale_h = 1.f; + float scale_w = 1.f; + + std::vector scales; + + if (out_h > 0 && out_w > 0) { + // axis are different in static/dynamic mode + bool with_dynamic = engine_->with_dynamic_shape(); + + int h_axis = (data_layout == framework::DataLayout::kNCHW) + with_dynamic; + int w_axis = + (data_layout == framework::DataLayout::kNCHW) + 1 + with_dynamic; + + scale_h = + static_cast(out_h) / static_cast(in_dim.d[h_axis]); + scale_w = + static_cast(out_w) / static_cast(in_dim.d[w_axis]); + } else { + scale_h = scale[0]; + scale_w = scale[1]; + } + + if (engine_->with_dynamic_shape()) { + scales.push_back(1.f); + } + + if (data_layout == framework::DataLayout::kNCHW) { + scales.push_back(1.f); + scales.push_back(scale_h); + scales.push_back(scale_w); + } else if (data_layout == framework::DataLayout::kNHWC) { + // NHWC + scales.push_back(scale_h); + scales.push_back(scale_w); + scales.push_back(1.f); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "Data layout must be NCHW or NHWC.")); + } + layer->setScales(scales.data(), scales.size()); + + RreplenishLayerAndOutput(layer, "nearest_interp_v2", {output_name}, + test_mode); + } +}; + +} // namespace tensorrt +} // namespace inference +} // namespace paddle + +REGISTER_TRT_OP_CONVERTER(nearest_interp_v2, NearestInterpolateV2OpConverter); diff --git a/paddle/fluid/inference/tensorrt/convert/test_nearest_interp_v2_op.cc b/paddle/fluid/inference/tensorrt/convert/test_nearest_interp_v2_op.cc new file mode 100644 index 0000000000000..f5ab6a9924931 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/test_nearest_interp_v2_op.cc @@ -0,0 +1,54 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" +#include "paddle/fluid/inference/tensorrt/convert/ut_helper.h" + +namespace paddle { +namespace inference { +namespace tensorrt { + +TEST(nearest_interp_v2_op, test_swish) { + std::unordered_set parameters; + framework::Scope scope; + TRTConvertValidation validator(10, parameters, scope, 1000); + validator.DeclInputVar("interp-X", nvinfer1::Dims3(3, 32, 32)); + validator.DeclOutputVar("interp-Out", nvinfer1::Dims3(3, 64, 64)); + + // Prepare Op description + framework::OpDesc desc; + desc.SetType("nearest_interp_v2"); + desc.SetInput("X", {"interp-X"}); + desc.SetOutput("Out", {"interp-Out"}); + + std::vector scale({2.f, 2.f}); + + desc.SetAttr("data_layout", "NCHW"); + desc.SetAttr("interp_method", "nearest"); + desc.SetAttr("align_corners", false); + desc.SetAttr("scale", scale); + desc.SetAttr("out_h", 0); + desc.SetAttr("out_w", 0); + + validator.SetOp(*desc.Proto()); + + validator.Execute(1); +} + +} // namespace tensorrt +} // namespace inference +} // namespace paddle + +USE_OP(nearest_interp_v2); diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc index 89159c0bb636c..e7318d07611ea 100644 --- a/paddle/fluid/inference/tensorrt/op_teller.cc +++ b/paddle/fluid/inference/tensorrt/op_teller.cc @@ -141,7 +141,8 @@ struct SimpleOpTypeSetTeller : public Teller { "reduce_mean", "conv3d", "conv3d_transpose", - "mish"}; + "mish", + "nearest_interp_v2"}; }; bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, @@ -599,6 +600,33 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, } } + if (op_type == "nearest_interp_v2") { + std::vector attrs{"data_layout", "interp_method", + "align_corners", "scale", + "out_h", "out_w"}; + for (auto const attr : attrs) { + if (!desc.HasAttr(attr)) return false; + } + auto data_layout = framework::StringToDataLayout( + BOOST_GET_CONST(std::string, desc.GetAttr("data_layout"))); + if (data_layout != framework::DataLayout::kNCHW && + data_layout != framework::DataLayout::kNHWC) + return false; + auto interp_method = + BOOST_GET_CONST(std::string, desc.GetAttr("interp_method")); + if (interp_method != "nearest") return false; + auto scale = BOOST_GET_CONST(std::vector, desc.GetAttr("scale")); + auto out_h = BOOST_GET_CONST(int, desc.GetAttr("out_h")); + auto out_w = BOOST_GET_CONST(int, desc.GetAttr("out_w")); + if (!(out_h > 0 && out_w > 0)) { + if (scale[0] <= 0.f || scale[1] <= 0.f) { + VLOG(3) << "scale factor must be greater than 0 if out_h or out_w is " + "not set."; + return false; + } + } + } + if (op_type == "roi_align") { if (!with_dynamic_shape) return false; diff --git a/paddle/fluid/inference/tests/infer_ut/test_det_mv3_db.cc b/paddle/fluid/inference/tests/infer_ut/test_det_mv3_db.cc index 67c2eeb0be5f9..cf3398b49ee9b 100644 --- a/paddle/fluid/inference/tests/infer_ut/test_det_mv3_db.cc +++ b/paddle/fluid/inference/tests/infer_ut/test_det_mv3_db.cc @@ -35,44 +35,11 @@ paddle::test::Record PrepareInput(int batch_size, int image_shape = 640) { void 
PrepareDynamicShape(paddle_infer::Config* config, int max_batch_size = 4) { // set dynamic shape range std::map> min_input_shape = { - {"x", {1, 3, 50, 50}}, - {"conv2d_92.tmp_0", {1, 120, 20, 20}}, - {"conv2d_91.tmp_0", {1, 24, 10, 10}}, - {"conv2d_59.tmp_0", {1, 96, 20, 20}}, - {"nearest_interp_v2_1.tmp_0", {1, 256, 10, 10}}, - {"nearest_interp_v2_2.tmp_0", {1, 256, 20, 20}}, - {"conv2d_124.tmp_0", {1, 256, 20, 20}}, - {"nearest_interp_v2_3.tmp_0", {1, 64, 20, 20}}, - {"nearest_interp_v2_4.tmp_0", {1, 64, 20, 20}}, - {"nearest_interp_v2_5.tmp_0", {1, 64, 20, 20}}, - {"elementwise_add_7", {1, 56, 2, 2}}, - {"nearest_interp_v2_0.tmp_0", {1, 256, 2, 2}}}; + {"x", {1, 3, 50, 50}}}; std::map> max_input_shape = { - {"x", {max_batch_size, 3, 2000, 2000}}, - {"conv2d_92.tmp_0", {max_batch_size, 120, 400, 400}}, - {"conv2d_91.tmp_0", {max_batch_size, 24, 200, 200}}, - {"conv2d_59.tmp_0", {max_batch_size, 96, 400, 400}}, - {"nearest_interp_v2_1.tmp_0", {max_batch_size, 256, 200, 200}}, - {"nearest_interp_v2_2.tmp_0", {max_batch_size, 256, 400, 400}}, - {"conv2d_124.tmp_0", {max_batch_size, 256, 400, 400}}, - {"nearest_interp_v2_3.tmp_0", {max_batch_size, 64, 400, 400}}, - {"nearest_interp_v2_4.tmp_0", {max_batch_size, 64, 400, 400}}, - {"nearest_interp_v2_5.tmp_0", {max_batch_size, 64, 400, 400}}, - {"elementwise_add_7", {max_batch_size, 56, 400, 400}}, - {"nearest_interp_v2_0.tmp_0", {max_batch_size, 256, 400, 400}}}; + {"x", {max_batch_size, 3, 1600, 1600}}}; std::map> opt_input_shape = { - {"x", {1, 3, 640, 640}}, - {"conv2d_92.tmp_0", {1, 120, 160, 160}}, - {"conv2d_91.tmp_0", {1, 24, 80, 80}}, - {"conv2d_59.tmp_0", {1, 96, 160, 160}}, - {"nearest_interp_v2_1.tmp_0", {1, 256, 80, 80}}, - {"nearest_interp_v2_2.tmp_0", {1, 256, 160, 160}}, - {"conv2d_124.tmp_0", {1, 256, 160, 160}}, - {"nearest_interp_v2_3.tmp_0", {1, 64, 160, 160}}, - {"nearest_interp_v2_4.tmp_0", {1, 64, 160, 160}}, - {"nearest_interp_v2_5.tmp_0", {1, 64, 160, 160}}, - {"elementwise_add_7", {1, 56, 40, 40}}, - {"nearest_interp_v2_0.tmp_0", {1, 256, 40, 40}}}; + {"x", {1, 3, 640, 640}}}; config->SetTRTDynamicShapeInfo(min_input_shape, max_input_shape, opt_input_shape); } @@ -123,7 +90,7 @@ TEST(tensorrt_tester_det_mv3_db, multi_thread2_trt_fp32_dynamic_shape_bz2) { FLAGS_modeldir + "/inference.pdiparams"); config.EnableUseGpu(100, 0); config.EnableTensorRtEngine( - 1 << 20, 2, 3, paddle_infer::PrecisionType::kFloat32, true, false); + 1 << 20, 2, 3, paddle_infer::PrecisionType::kFloat32, false, false); PrepareDynamicShape(&config, 4); // get groudtruth by disbale ir paddle_infer::services::PredictorPool pred_pool_no_ir(config_no_ir, 1); diff --git a/python/paddle/fluid/tests/unittests/ir/inference/CMakeLists.txt b/python/paddle/fluid/tests/unittests/ir/inference/CMakeLists.txt index 54229533935a4..b951afdfad5ea 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/ir/inference/CMakeLists.txt @@ -68,4 +68,5 @@ set_tests_properties(test_trt_conv_quant_dequant_pass PROPERTIES TIMEOUT 100) set_tests_properties(test_trt_matmul_quant_dequant PROPERTIES TIMEOUT 100) set_tests_properties(test_trt_conv3d_op PROPERTIES TIMEOUT 60) set_tests_properties(test_trt_conv3d_transpose_op PROPERTIES TIMEOUT 60) +set_tests_properties(test_trt_nearest_interp_v2_op PROPERTIES TIMEOUT 30) endif() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_nearest_interp_v2.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_nearest_interp_v2.py new 
file mode 100644 index 0000000000000..0c7715c957085 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_nearest_interp_v2.py @@ -0,0 +1,101 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from trt_layer_auto_scan_test import TrtLayerAutoScanTest, SkipReasons +from program_config import TensorConfig, ProgramConfig +import numpy as np +import paddle.inference as paddle_infer +from functools import partial +from typing import Optional, List, Callable, Dict, Any, Set + + +class TrtConvertNearestInterpV2Test(TrtLayerAutoScanTest): + def is_program_valid(self, program_config: ProgramConfig) -> bool: + return True + + def sample_program_configs(self): + def generate_input(): + return np.ones([1, 3, 32, 32]).astype(np.float32) + + ops_config = [{ + "op_type": "nearest_interp_v2", + "op_inputs": { + "X": ["input_data"] + }, + "op_outputs": { + "Out": ["interp_output_data"] + }, + "op_attrs": { + "data_layout": "NCHW", + "interp_method": "nearest", + "align_corners": False, + "scale": [2., 2.], + "out_h": 0, + "out_w": 0 + } + }] + + ops = self.generate_op_config(ops_config) + program_config = ProgramConfig( + ops=ops, + weights={}, + inputs={"input_data": TensorConfig(data_gen=generate_input)}, + outputs=["interp_output_data"]) + + yield program_config + + def sample_predictor_configs( + self, program_config) -> (paddle_infer.Config, List[int], float): + def generate_dynamic_shape(attrs): + self.dynamic_shape.min_input_shape = {"input_data": [1, 3, 32, 32]} + self.dynamic_shape.max_input_shape = {"input_data": [4, 3, 64, 64]} + self.dynamic_shape.opt_input_shape = {"input_data": [1, 3, 64, 64]} + + def clear_dynamic_shape(): + self.dynamic_shape.min_input_shape = {} + self.dynamic_shape.max_input_shape = {} + self.dynamic_shape.opt_input_shape = {} + + def generate_trt_nodes_num(attrs, dynamic_shape): + return 1, 2 + + attrs = [ + program_config.ops[i].attrs + for i in range(len(program_config.ops)) + ] + + # for static_shape + clear_dynamic_shape() + self.trt_param.precision = paddle_infer.PrecisionType.Float32 + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, False), 1e-5 + self.trt_param.precision = paddle_infer.PrecisionType.Half + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, False), 1e-2 + + # for dynamic_shape + generate_dynamic_shape(attrs) + self.trt_param.precision = paddle_infer.PrecisionType.Float32 + yield self.create_inference_config(), generate_trt_nodes_num(attrs, + True), 1e-5 + self.trt_param.precision = paddle_infer.PrecisionType.Half + yield self.create_inference_config(), generate_trt_nodes_num(attrs, + True), 1e-2 + + def test(self): + self.run_test() + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_nearest_interp_v2_op.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_nearest_interp_v2_op.py new file mode 100644 index 0000000000000..101ace6cd54a8 
--- /dev/null +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_nearest_interp_v2_op.py @@ -0,0 +1,151 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np +from inference_pass_test import InferencePassTest +import paddle.fluid.core as core +from paddle import fluid +import paddle.nn.functional as F +from paddle.fluid.core import PassVersionChecker +from paddle.fluid.core import AnalysisConfig + + +class TRTNearestInterpTest(InferencePassTest): + def setUp(self): + self.set_params() + + with fluid.program_guard(self.main_program, self.startup_program): + if self.data_layout == 'NCHW': + shape = [ + -1, self.channels, self.origin_shape[0], + self.origin_shape[1] + ] + else: + shape = [ + -1, self.origin_shape[0], self.origin_shape[1], + self.channels + ] + data = fluid.data(name='data', shape=shape, dtype='float32') + resize_out = self.append_nearest_interp(data) + out = fluid.layers.batch_norm(resize_out, is_test=True) + + if self.data_layout == 'NCHW': + shape = [ + self.bs, self.channels, self.origin_shape[0], + self.origin_shape[1] + ] + else: + shape = [ + self.bs, self.origin_shape[0], self.origin_shape[1], + self.channels + ] + + self.feeds = {'data': np.random.random(shape).astype('float32'), } + self.enable_trt = True + self.trt_parameters = TRTNearestInterpTest.TensorRTParam( + 1 << 30, self.bs, 1, AnalysisConfig.Precision.Float32, False, False) + self.fetch_list = [out] + + def set_params(self): + self.bs = 4 + self.scale = -1 + self.channels = 3 + self.origin_shape = (32, 32) # HW + self.resize_shape = (64, 64) # HW + self.align_corners = False + self.data_layout = 'NCHW' + + def append_nearest_interp(self, data): + if self.scale > 0.: + return F.interpolate( + data, + scale_factor=self.scale, + align_corners=self.align_corners, + mode='nearest', + data_format=self.data_layout) + return F.interpolate( + data, + size=self.resize_shape, + align_corners=self.align_corners, + mode='nearest', + data_format=self.data_layout) + + def test_check_output(self): + if core.is_compiled_with_cuda(): + use_gpu = True + self.check_output_with_option(use_gpu, flatten=True) + self.assertTrue( + PassVersionChecker.IsCompatible('tensorrt_subgraph_pass')) + + +class TRTNearestInterpTest1(TRTNearestInterpTest): + def set_params(self): + self.bs = 4 + self.scale = 2. 
+ self.channels = 3 + self.origin_shape = (32, 32) # HW + self.resize_shape = (64, 64) # HW + self.align_corners = False + self.data_layout = 'NCHW' + + +class TRTNearestInterpTest2(TRTNearestInterpTest): + def set_params(self): + self.bs = 4 + self.scale = -1 + self.channels = 3 + self.origin_shape = (32, 32) # HW + self.resize_shape = (47, 48) # HW + self.align_corners = False + self.data_layout = 'NCHW' + + +class TRTNearestInterpTest3(TRTNearestInterpTest): + def set_params(self): + self.bs = 4 + self.scale = -1 + self.channels = 3 + self.origin_shape = (32, 32) # HW + self.resize_shape = (64, 64) # HW + self.align_corners = False + self.data_layout = 'NHWC' + + +class TRTNearestInterpTest4(TRTNearestInterpTest): + def set_params(self): + self.bs = 4 + self.scale = 2. + self.channels = 3 + self.origin_shape = (32, 32) # HW + self.resize_shape = (64, 64) # HW + self.align_corners = False + self.data_layout = 'NHWC' + + +class TRTNearestInterpTest5(TRTNearestInterpTest): + def set_params(self): + self.bs = 4 + self.scale = -1 + self.channels = 3 + self.origin_shape = (32, 32) # HW + self.resize_shape = (47, 48) # HW + self.align_corners = False + self.data_layout = 'NHWC' + + +if __name__ == "__main__": + unittest.main() From 6cdc5a4ba16f11a09e8a723204b02de1f16c51c3 Mon Sep 17 00:00:00 2001 From: jiangcheng Date: Tue, 19 Oct 2021 15:24:38 +0800 Subject: [PATCH 028/116] Optimize the subgraph generated by BuildCinnPass (#36503) * add feed op and new var for the generated subgraph * perfect the test script of build_cinn_pass * remove useless clear and perfect some annotation --- .../framework/paddle2cinn/build_cinn_pass.cc | 129 ++++++++++++++++-- .../paddle2cinn/build_cinn_pass_test.cc | 98 +++++++++++-- 2 files changed, 198 insertions(+), 29 deletions(-) diff --git a/paddle/fluid/framework/paddle2cinn/build_cinn_pass.cc b/paddle/fluid/framework/paddle2cinn/build_cinn_pass.cc index ffdbb46bd7c06..caddc8fbb7381 100644 --- a/paddle/fluid/framework/paddle2cinn/build_cinn_pass.cc +++ b/paddle/fluid/framework/paddle2cinn/build_cinn_pass.cc @@ -64,10 +64,81 @@ using framework::ir::Node; using GraphNodeVec = std::vector; using GraphNodeSet = std::unordered_set; +// Deal with subgraph's feed input var node: +// create a new input var node and it's feed op node +void AddFeedOpAndVar(const std::unordered_set& feed_vars, + const GraphNodeSet& cluster, + const std::unordered_map& old_op2new_op, + Graph* graph) { + for (auto* old_var : feed_vars) { + // create feed op + OpDesc desc; + desc.SetType("feed"); + desc.SetOutput("Out", {old_var->Name()}); + auto op = graph->CreateOpNode(&desc); + + // create new feed var node (SSAGraph) + auto var = graph->CreateVarNode(old_var->Var()); + + // link feed op and feed var + op->outputs = {var}; + var->inputs = {op}; + + // link feed var to cluster op + for (auto* old_op : old_var->outputs) { + if (cluster.count(old_op)) { + var->outputs.emplace_back(old_op2new_op.at(old_op)); + old_op2new_op.at(old_op)->inputs.emplace_back(var); + } + // Do not need relink old op or old var here, they will be + // fixed in RemoveLinkFromCluster, here we just deal with + // new subgraph's node. 
+ } + } +} + +// Deal with subgraph's parameter var node: +// create a new input var node, it's data will get by scope, +// so it don't need feed op +void AddParamVar(const std::unordered_set& param_vars, + const GraphNodeSet& cluster, + const std::unordered_map& old_op2new_op, + Graph* graph) { + for (auto* old_var : param_vars) { + auto var = graph->CreateVarNode(old_var->Var()); + + for (auto* old_op : old_var->outputs) { + if (cluster.count(old_op)) { + var->outputs.emplace_back(old_op2new_op.at(old_op)); + old_op2new_op.at(old_op)->inputs.emplace_back(var); + } + } + } +} + +// Deal with subgraph's outputs var node: +// create a new output var node and it's fetch op +void AddOutputVar(const std::unordered_set& output_vars, + const GraphNodeSet& cluster, + const std::unordered_map& old_op2new_op, + Graph* graph) { + for (auto* old_var : output_vars) { + auto var = graph->CreateVarNode(old_var->Var()); + + for (auto* old_op : old_var->inputs) { + if (cluster.count(old_op)) { + var->inputs.emplace_back(old_op2new_op.at(old_op)); + old_op2new_op.at(old_op)->outputs.emplace_back(var); + } + } + } +} + // Create new subgraph with and op nodes are cluster nodes, and all // var node are from internal nodes -std::unique_ptr CreateNewSubGraph( - const GraphNodeSet& cluster, const GraphNodeSet& cluster_internals) { +std::unique_ptr CreateNewSubGraph(const GraphNodeSet& cluster, + const GraphNodeSet& cluster_internals, + const GraphNodeSet& cluster_inputs) { // Graph's constructor must has one parameter, and in our code, // the ProgramDesc is useless, so here we pass a temporary object. auto sub_graph = std::make_unique(framework::ProgramDesc()); @@ -84,6 +155,8 @@ std::unique_ptr CreateNewSubGraph( old_var2new_var[var] = sub_node; } + std::unordered_set need_feed_vars; + std::unordered_set param_vars, output_vars; // the subgraph is independently, so here we only need link // to the node in new subgraph, and discard the link to // out-graph. @@ -91,15 +164,36 @@ std::unique_ptr CreateNewSubGraph( for (auto* var : op->inputs) { if (cluster_internals.count(var)) { old_op2new_op[op]->inputs.emplace_back(old_var2new_var[var]); + } else if (cluster_inputs.count(var)) { + if (var->Var()->IsParameter()) { + // Parameters have been preserved in scope, compared to feed var, + // param just need add new var and don't need add feed op. + // The var is used for check whether we need preserve the tensor + // when transform paddle scope to CINN scope. + param_vars.insert(var); + } else { + // When the var is subgraph input and the var is not parameter, + // we need add a new feed op to feed the var. + need_feed_vars.insert(var); + } } } for (auto* var : op->outputs) { if (cluster_internals.count(var)) { old_op2new_op[op]->outputs.emplace_back(old_var2new_var[var]); + } else { + // Create new output var node to guarantee the independency of + // subgraph. In other words, the subgraph has no connection with + // other graph, even the input graph. + output_vars.insert(var); } } } + AddFeedOpAndVar(need_feed_vars, cluster, old_op2new_op, sub_graph.get()); + AddParamVar(param_vars, cluster, old_op2new_op, sub_graph.get()); + AddOutputVar(output_vars, cluster, old_op2new_op, sub_graph.get()); + for (auto* var : cluster_internals) { for (auto* op : var->inputs) { if (cluster.count(op)) { @@ -118,10 +212,12 @@ std::unique_ptr CreateNewSubGraph( // This interface is used to classify all variables involved in a cluster into // three types: inputs, outputs, and internals. 
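As a rough, standalone sketch of the classification rule spelled out in this comment block (a toy dict-based graph, not Paddle's ir::Graph):

op_inputs = {"mul": {"v1", "v2"}, "add": {"v3", "v4"}, "relu": {"v5"}}
op_outputs = {"mul": {"v3"}, "add": {"v5"}, "relu": {"v6"}}
cluster = {"mul", "add"}  # ops captured into the CINN subgraph

produced = {v for op in cluster for v in op_outputs[op]}
consumed = {v for op in cluster for v in op_inputs[op]}
consumed_outside = {v for op, ins in op_inputs.items() if op not in cluster for v in ins}

cluster_inputs = consumed - produced            # {'v1', 'v2', 'v4'}
cluster_internals = {v for v in produced if v in consumed and v not in consumed_outside}
cluster_outputs = produced - cluster_internals  # {'v5'}
print(cluster_inputs, cluster_internals, cluster_outputs)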
-// Specially, the internal node is a node that only used by sub-graph, and +// The input node is some subgraph op's input but not any subgraph op's output. +// The output node is some subgraph op's output and some out-graph op's input. +// Specially, the internal node is a node that only used by subgraph, and // out-graph should not using this node at all. -// inputs & outputs & internals == NULL -// inputs | outputs | internals == all graph node +// cluster_inputs & cluster_outputs & cluster_internals == NULL +// cluster_outputs | cluster_internals == all graph op's outputs node void AnalyseClusterVariables(const GraphNodeSet& cluster, GraphNodeSet* cluster_inputs, GraphNodeSet* cluster_outputs, @@ -154,10 +250,6 @@ void AnalyseClusterVariables(const GraphNodeSet& cluster, } } - // if a output node also exists in input list, remove. - for (auto* var_node : *cluster_inputs) { - cluster_outputs->erase(var_node); - } // if a output node also exists in internal list, remove. for (auto* var_node : *cluster_internals) { cluster_outputs->erase(var_node); @@ -206,14 +298,23 @@ void RemoveLinkFromCluster(const GraphNodeSet& cluster, // removing useless link from cluster_inputs to cluster for (auto* var_node : cluster_inputs) { - auto preserved_nodes = get_preserved_ops(var_node->outputs); - var_node->outputs.assign(preserved_nodes.begin(), preserved_nodes.end()); + auto preserved_ops = get_preserved_ops(var_node->outputs); + var_node->outputs.assign(preserved_ops.begin(), preserved_ops.end()); + // According to SSA form, a var node must not be any two op's output, + // and the cluster_inputs var nodes is defined as an out-graph op's + // output, so the cluster_inputs var nodes are not any subgraph op's + // output. Do not reassign input list here. } // removing useless link from cluster to cluster_outputs for (auto* var_node : cluster_outputs) { - auto preserved_nodes = get_preserved_ops(var_node->inputs); - var_node->inputs.assign(preserved_nodes.begin(), preserved_nodes.end()); + auto preserved_ops = get_preserved_ops(var_node->inputs); + var_node->inputs.assign(preserved_ops.begin(), preserved_ops.end()); + + // Note that cluster_outputs var node maybe some subgraph op's input, + // here we need remove them. 
+ preserved_ops = get_preserved_ops(var_node->outputs); + var_node->outputs.assign(preserved_ops.begin(), preserved_ops.end()); } } @@ -272,7 +373,7 @@ void SearchAllSubgraphs(Graph* graph, &cluster_internals); cinn_subgraphs->emplace_back( - CreateNewSubGraph(cluster_set, cluster_internals)); + CreateNewSubGraph(cluster_set, cluster_internals, cluster_inputs)); // replacing subgraph to a new special op node ReplaceSubGraphWithSpecialOpNode(cluster_set, cluster_inputs, diff --git a/paddle/fluid/framework/paddle2cinn/build_cinn_pass_test.cc b/paddle/fluid/framework/paddle2cinn/build_cinn_pass_test.cc index 883d5c6fbfb39..bf68a2b554b7f 100644 --- a/paddle/fluid/framework/paddle2cinn/build_cinn_pass_test.cc +++ b/paddle/fluid/framework/paddle2cinn/build_cinn_pass_test.cc @@ -54,6 +54,35 @@ inline Node* GetNode(const std::unordered_set& nodes, [&op_name](const Node* node) { return node->Name() == op_name; }); } +inline bool CheckGraphIndependence(const std::unordered_set& nodes) { + auto check_node_ok = [&nodes](Node* n1, Node* n2) -> bool { + if (n1->IsOp() && !n2->IsVar()) { + return false; + } + if (n1->IsVar() && !n2->IsOp()) { + return false; + } + if (nodes.count(n2) == 0) { + return false; + } + return true; + }; + + for (auto node : nodes) { + for (auto in : node->inputs) { + if (!check_node_ok(node, in)) { + return false; + } + } + for (auto out : node->outputs) { + if (!check_node_ok(node, out)) { + return false; + } + } + } + return true; +} + std::unique_ptr BuildNoCinnSubgraph() { ProgramDesc prog; auto g = std::make_unique(prog); @@ -67,6 +96,8 @@ std::unique_ptr BuildNoCinnSubgraph() { VarDesc var1("var1"); VarDesc var2("var2"); + var2.SetPersistable(true); + var2.SetIsParameter(true); VarDesc var3("var3"); VarDesc var4("var4"); @@ -109,6 +140,7 @@ TEST(BuildCinnPassTest, NoCinnSubgraph) { // After search, origin graph should no change ASSERT_EQ(previous_nodes, g->Nodes()); + ASSERT_TRUE(CheckGraphIndependence(g->Nodes())); // After search, there should one cinn subgraph ASSERT_TRUE(cinn_subgraphs.empty()); @@ -119,11 +151,8 @@ std::unique_ptr BuildAllOpSupportCinnGraph() { auto g = std::make_unique(prog); // v1 -- - // | // | --> mul --> v3 -- - // | | // v2 -- | --> add --> v5 --> relu --> v6 - // | // v4 -- OpDesc add_op; @@ -135,6 +164,8 @@ std::unique_ptr BuildAllOpSupportCinnGraph() { VarDesc var1("var1"); VarDesc var2("var2"); + var2.SetPersistable(true); + var2.SetIsParameter(true); VarDesc var3("var3"); VarDesc var4("var4"); VarDesc var5("var5"); @@ -192,6 +223,7 @@ TEST(BuildCinnPassTest, AllOpSupportCinn) { // v4 --| const auto& nodes = g->Nodes(); ASSERT_EQ(nodes.size(), static_cast(5)); + ASSERT_TRUE(CheckGraphIndependence(nodes)); // A new op named kCinnLaunchOp should be added ASSERT_TRUE(CheckNodeExisted(nodes, kCinnLaunchOp)); @@ -214,16 +246,34 @@ TEST(BuildCinnPassTest, AllOpSupportCinn) { ASSERT_FALSE(CheckNodeExisted(nodes, "relu")); // After search, there should has just one cinn subgraph - // mul --> v3 --> add --> v5 --> relu + // feed --> v1 -- + // | --> mul --> v3 -- + // v2 -- | --> add --> v5 --> relu --> v6 + // feed --> v4 -- ASSERT_EQ(cinn_subgraphs.size(), static_cast(1)); const auto& subgraph = cinn_subgraphs.back(); const auto& subnodes = subgraph->Nodes(); - ASSERT_EQ(subnodes.size(), static_cast(5)); + ASSERT_EQ(subnodes.size(), static_cast(11)); + ASSERT_TRUE(CheckGraphIndependence(subnodes)); ASSERT_TRUE(CheckNodeExisted(subnodes, "mul")); ASSERT_TRUE(CheckNodeExisted(subnodes, "add")); ASSERT_TRUE(CheckNodeExisted(subnodes, "relu")); + 
ASSERT_EQ(CountNode(subnodes, "feed"), 2); + + // No-parameter input should has feed op + auto new_v1 = GetNode(subnodes, "var1"); + ASSERT_EQ(new_v1->inputs.size(), static_cast(1)); + ASSERT_EQ(new_v1->outputs.size(), static_cast(1)); + ASSERT_EQ(new_v1->inputs[0]->Name(), "feed"); + ASSERT_EQ(new_v1->outputs[0]->Name(), "mul"); + + // Parameter input should not has feed op + auto new_v2 = GetNode(subnodes, "var2"); + ASSERT_TRUE(new_v2->inputs.empty()); + ASSERT_EQ(new_v2->outputs.size(), static_cast(1)); + ASSERT_EQ(new_v2->outputs[0]->Name(), "mul"); } std::unique_ptr BuildGraphWithOneCinnSubgraph() { @@ -231,9 +281,7 @@ std::unique_ptr BuildGraphWithOneCinnSubgraph() { auto g = std::make_unique(prog); // fake1 --> v1 -- - // | // | --> mul --> v3 --> relu --> v4 --> fake2 - // | // v2 -- OpDesc fake1_op; @@ -247,6 +295,8 @@ std::unique_ptr BuildGraphWithOneCinnSubgraph() { VarDesc var1("var1"); VarDesc var2("var2"); + var2.SetPersistable(true); + var2.SetIsParameter(true); VarDesc var3("var3"); VarDesc var4("var4"); @@ -299,6 +349,7 @@ TEST(BuildCinnPassTest, OneCinnSubgraph) { // v2 -- const auto& nodes = g->Nodes(); ASSERT_EQ(nodes.size(), static_cast(6)); + ASSERT_TRUE(CheckGraphIndependence(nodes)); // A new op named kCinnLaunchOp should be added ASSERT_TRUE(CheckNodeExisted(nodes, kCinnLaunchOp)); @@ -312,15 +363,19 @@ TEST(BuildCinnPassTest, OneCinnSubgraph) { ASSERT_TRUE(CheckNodeExisted(nodes, "fake2")); // After search, there should has just one cinn subgraph - // mul --> v3 --> relu + // feed --> v1 -- + // | --> mul --> v3 --> relu --> v4 + // v2 -- ASSERT_EQ(cinn_subgraphs.size(), static_cast(1)); const auto& subgraph = cinn_subgraphs.back(); const auto& subnodes = subgraph->Nodes(); - ASSERT_EQ(subnodes.size(), static_cast(3)); + ASSERT_EQ(subnodes.size(), static_cast(7)); + ASSERT_TRUE(CheckGraphIndependence(subnodes)); ASSERT_TRUE(CheckNodeExisted(subnodes, "mul")); ASSERT_TRUE(CheckNodeExisted(subnodes, "relu")); + ASSERT_EQ(CountNode(subnodes, "feed"), 1); } std::unique_ptr BuildGraphWithMultiCinnSubgraph() { @@ -328,9 +383,7 @@ std::unique_ptr BuildGraphWithMultiCinnSubgraph() { auto g = std::make_unique(prog); // fake1 --> v1 -- - // | // | --> mul --> v3 --> fake2 --> v4 --> relu --> v5 --> fake3 - // | // v2 -- OpDesc fake1_op; @@ -346,6 +399,8 @@ std::unique_ptr BuildGraphWithMultiCinnSubgraph() { VarDesc var1("var1"); VarDesc var2("var2"); + var2.SetPersistable(true); + var2.SetIsParameter(true); VarDesc var3("var3"); VarDesc var4("var4"); VarDesc var5("var5"); @@ -406,6 +461,7 @@ TEST(BuildCinnPassTest, MultiCinnSubgraph) { // v2 - const auto& nodes = g->Nodes(); ASSERT_EQ(nodes.size(), static_cast(10)); + ASSERT_TRUE(CheckGraphIndependence(nodes)); // A new op named kCinnLaunchOp should be added ASSERT_TRUE(CheckNodeExisted(nodes, kCinnLaunchOp)); @@ -424,15 +480,27 @@ TEST(BuildCinnPassTest, MultiCinnSubgraph) { // and each of subgraphs just has one node. 
ASSERT_EQ(cinn_subgraphs.size(), static_cast(2)); - // subgraph1: relu + // subgraph1: + // feed --> v4 --> relu --> v5 + // subgraph2: + // feed --> v1 -- + // | --> mul --> v3 + // v2 -- const auto& subgraph1 = cinn_subgraphs[0]; const auto& subnodes1 = subgraph1->Nodes(); - ASSERT_EQ(subnodes1.size(), static_cast(1)); + ASSERT_TRUE(CheckGraphIndependence(subnodes1)); - // subgraph2: mul const auto& subgraph2 = cinn_subgraphs[1]; const auto& subnodes2 = subgraph2->Nodes(); - ASSERT_EQ(subnodes2.size(), static_cast(1)); + ASSERT_TRUE(CheckGraphIndependence(subnodes2)); + + if (CheckNodeExisted(subnodes1, "relu")) { + ASSERT_EQ(subnodes1.size(), static_cast(4)); + ASSERT_EQ(subnodes2.size(), static_cast(5)); + } else { + ASSERT_EQ(subnodes2.size(), static_cast(4)); + ASSERT_EQ(subnodes1.size(), static_cast(5)); + } } } // namespace paddle2cinn From be6a83301e04389902137fee6aee41134e83f4f3 Mon Sep 17 00:00:00 2001 From: Wilber Date: Tue, 19 Oct 2021 15:49:13 +0800 Subject: [PATCH 029/116] Inference add type check in copy_from_cpu (#36429) * update * fix ut error * update ut --- .../fluid/inference/api/analysis_predictor.cc | 18 ++++++ .../api/analysis_predictor_tester.cc | 9 +++ .../inference/api/paddle_inference_api.h | 2 + paddle/fluid/inference/tensorrt/engine.cc | 13 ++++ paddle/fluid/inference/tensorrt/helper.h | 16 +++++ paddle/fluid/pybind/inference_api.cc | 11 ++-- python/paddle/fluid/inference/__init__.py | 2 +- python/paddle/fluid/inference/wrapper.py | 15 +++++ .../tests/unittests/test_inference_api.py | 59 +++++++++++++++++++ python/paddle/inference/__init__.py | 4 ++ 10 files changed, 144 insertions(+), 5 deletions(-) diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index dfa27037205f1..491ed71c4bccc 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -36,6 +36,7 @@ #include "paddle/fluid/inference/analysis/helper.h" #include "paddle/fluid/inference/analysis/passes/memory_optimize_pass.h" #include "paddle/fluid/inference/api/helper.h" +#include "paddle/fluid/inference/api/paddle_inference_api.h" #include "paddle/fluid/inference/api/paddle_inference_pass.h" #include "paddle/fluid/inference/utils/io_utils.h" #include "paddle/fluid/inference/utils/singleton.h" @@ -56,6 +57,7 @@ #if PADDLE_WITH_TENSORRT #include "paddle/fluid/inference/tensorrt/convert/op_converter.h" +#include "paddle/fluid/inference/tensorrt/helper.h" #include "paddle/fluid/inference/tensorrt/trt_int8_calibrator.h" #endif @@ -1471,6 +1473,22 @@ int GetNumBytesOfDataType(DataType dtype) { std::string GetVersion() { return paddle::get_version(); } +std::tuple GetTrtCompileVersion() { +#ifdef PADDLE_WITH_TENSORRT + return paddle::inference::tensorrt::GetTrtCompileVersion(); +#else + return std::tuple{0, 0, 0}; +#endif +} + +std::tuple GetTrtRuntimeVersion() { +#ifdef PADDLE_WITH_TENSORRT + return paddle::inference::tensorrt::GetTrtRuntimeVersion(); +#else + return std::tuple{0, 0, 0}; +#endif +} + std::string UpdateDllFlag(const char *name, const char *value) { return paddle::UpdateDllFlag(name, value); } diff --git a/paddle/fluid/inference/api/analysis_predictor_tester.cc b/paddle/fluid/inference/api/analysis_predictor_tester.cc index 86fbde00075f0..a15a1cd84b140 100644 --- a/paddle/fluid/inference/api/analysis_predictor_tester.cc +++ b/paddle/fluid/inference/api/analysis_predictor_tester.cc @@ -359,6 +359,15 @@ TEST(AnalysisPredictor, set_xpu_device_id) { namespace paddle_infer { 
TEST(Predictor, Run) { + auto trt_compile_ver = GetTrtCompileVersion(); + auto trt_runtime_ver = GetTrtRuntimeVersion(); + LOG(INFO) << "trt compile version: " << std::get<0>(trt_compile_ver) << "." + << std::get<1>(trt_compile_ver) << "." + << std::get<2>(trt_compile_ver); + LOG(INFO) << "trt runtime version: " << std::get<0>(trt_runtime_ver) << "." + << std::get<1>(trt_runtime_ver) << "." + << std::get<2>(trt_runtime_ver); + Config config; config.SetModel(FLAGS_dirname); diff --git a/paddle/fluid/inference/api/paddle_inference_api.h b/paddle/fluid/inference/api/paddle_inference_api.h index a516abb1432ca..35b90bfa54f73 100644 --- a/paddle/fluid/inference/api/paddle_inference_api.h +++ b/paddle/fluid/inference/api/paddle_inference_api.h @@ -169,6 +169,8 @@ PD_INFER_DECL std::shared_ptr CreatePredictor( PD_INFER_DECL int GetNumBytesOfDataType(DataType dtype); PD_INFER_DECL std::string GetVersion(); +PD_INFER_DECL std::tuple GetTrtCompileVersion(); +PD_INFER_DECL std::tuple GetTrtRuntimeVersion(); PD_INFER_DECL std::string UpdateDllFlag(const char* name, const char* value); namespace services { diff --git a/paddle/fluid/inference/tensorrt/engine.cc b/paddle/fluid/inference/tensorrt/engine.cc index 24644645eee49..26182a7932199 100644 --- a/paddle/fluid/inference/tensorrt/engine.cc +++ b/paddle/fluid/inference/tensorrt/engine.cc @@ -190,6 +190,19 @@ void TensorRTEngine::FreezeNetwork() { #if IS_TRT_VERSION_GE(6000) LOG(INFO) << "Run Paddle-TRT Dynamic Shape mode."; for (auto &input : min_input_shape_) { +#if IS_TRT_VERSION_LT(7000) + // trt6 will check all_of input > 0 + if (!(std::all_of(input.second.begin(), input.second.end(), + [](int x) { return x > 0; }) && + std::all_of(max_input_shape_[input.first].begin(), + max_input_shape_[input.first].end(), + [](int x) { return x > 0; }) && + std::all_of(optim_input_shape_[input.first].begin(), + optim_input_shape_[input.first].end(), + [](int x) { return x > 0; }))) { + continue; + } +#endif VLOG(4) << "TRT dynamic_shape set " << input.first << " min: " << Vec2Str(input.second) << ", max: " << Vec2Str(max_input_shape_[input.first]) diff --git a/paddle/fluid/inference/tensorrt/helper.h b/paddle/fluid/inference/tensorrt/helper.h index 16595b8a03298..b8051d8610442 100644 --- a/paddle/fluid/inference/tensorrt/helper.h +++ b/paddle/fluid/inference/tensorrt/helper.h @@ -73,8 +73,24 @@ static nvinfer1::IPluginRegistry* GetPluginRegistry() { static int GetInferLibVersion() { return static_cast(dy::getInferLibVersion()); } +#else +static int GetInferLibVersion() { return 0; } #endif +static std::tuple GetTrtRuntimeVersion() { + int ver = GetInferLibVersion(); + int major = ver / 1000; + ver -= major * 1000; + int minor = ver / 100; + int patch = ver - minor * 100; + return std::tuple{major, minor, patch}; +} + +static std::tuple GetTrtCompileVersion() { + return std::tuple{NV_TENSORRT_MAJOR, NV_TENSORRT_MINOR, + NV_TENSORRT_PATCH}; +} + // A logger for create TensorRT infer builder. 
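Both version queries above reduce to splitting one integer; a quick Python sketch of the decomposition GetTrtRuntimeVersion performs (7203 is only a sample value, standing for a 7.2.3 library):

def split_trt_version(ver):
    # major * 1000 + minor * 100 + patch, as in GetTrtRuntimeVersion().
    major, rest = divmod(ver, 1000)
    minor, patch = divmod(rest, 100)
    return major, minor, patch

print(split_trt_version(7203))  # (7, 2, 3)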
class NaiveLogger : public nvinfer1::ILogger { public: diff --git a/paddle/fluid/pybind/inference_api.cc b/paddle/fluid/pybind/inference_api.cc index 8ce7bea2d8e70..e02f25ff636a2 100644 --- a/paddle/fluid/pybind/inference_api.cc +++ b/paddle/fluid/pybind/inference_api.cc @@ -330,6 +330,8 @@ void BindInferenceApi(py::module *m) { m->def("paddle_dtype_size", &paddle::PaddleDtypeSize); m->def("paddle_tensor_to_bytes", &SerializePDTensorToBytes); m->def("get_version", &paddle_infer::GetVersion); + m->def("get_trt_compile_version", &paddle_infer::GetTrtCompileVersion); + m->def("get_trt_runtime_version", &paddle_infer::GetTrtRuntimeVersion); m->def("get_num_bytes_of_data_type", &paddle_infer::GetNumBytesOfDataType); } @@ -739,10 +741,11 @@ void BindZeroCopyTensor(py::module *m) { void BindPaddleInferTensor(py::module *m) { py::class_(*m, "PaddleInferTensor") .def("reshape", &paddle_infer::Tensor::Reshape) - .def("copy_from_cpu", &PaddleInferTensorCreate) - .def("copy_from_cpu", &PaddleInferTensorCreate) - .def("copy_from_cpu", &PaddleInferTensorCreate) - .def("copy_from_cpu", &PaddleInferTensorCreate) + .def("copy_from_cpu_bind", &PaddleInferTensorCreate) + .def("copy_from_cpu_bind", &PaddleInferTensorCreate) + .def("copy_from_cpu_bind", &PaddleInferTensorCreate) + .def("copy_from_cpu_bind", + &PaddleInferTensorCreate) .def("copy_to_cpu", &PaddleInferTensorToNumpy) .def("shape", &paddle_infer::Tensor::shape) .def("set_lod", &paddle_infer::Tensor::SetLoD) diff --git a/python/paddle/fluid/inference/__init__.py b/python/paddle/fluid/inference/__init__.py index 3013c1f2aff87..946b4f0c8d7b2 100644 --- a/python/paddle/fluid/inference/__init__.py +++ b/python/paddle/fluid/inference/__init__.py @@ -14,4 +14,4 @@ from .wrapper import Config, DataType, PlaceType, PrecisionType, Tensor, Predictor -from ..core import create_predictor, get_version, get_num_bytes_of_data_type, PredictorPool +from ..core import create_predictor, get_version, get_num_bytes_of_data_type, PredictorPool, get_trt_compile_version, get_trt_runtime_version diff --git a/python/paddle/fluid/inference/wrapper.py b/python/paddle/fluid/inference/wrapper.py index 96885edcc5e82..2c1b2c77504d9 100644 --- a/python/paddle/fluid/inference/wrapper.py +++ b/python/paddle/fluid/inference/wrapper.py @@ -15,9 +15,24 @@ from ..core import AnalysisConfig, PaddleDType, PaddlePlace from ..core import PaddleInferPredictor, PaddleInferTensor +import numpy as np + DataType = PaddleDType PlaceType = PaddlePlace PrecisionType = AnalysisConfig.Precision Config = AnalysisConfig Tensor = PaddleInferTensor Predictor = PaddleInferPredictor + + +def tensor_copy_from_cpu(self, data): + ''' + Support input type check based on tensor.copy_from_cpu. 
+ ''' + if not isinstance(data, np.ndarray): + raise TypeError( + "In copy_from_cpu, we only support numpy ndarray data type.") + self.copy_from_cpu_bind(data) + + +Tensor.copy_from_cpu = tensor_copy_from_cpu diff --git a/python/paddle/fluid/tests/unittests/test_inference_api.py b/python/paddle/fluid/tests/unittests/test_inference_api.py index 98ec0b3db04c4..7ed908eb33b81 100644 --- a/python/paddle/fluid/tests/unittests/test_inference_api.py +++ b/python/paddle/fluid/tests/unittests/test_inference_api.py @@ -14,10 +14,14 @@ import os, shutil import unittest +import paddle +paddle.enable_static() import numpy as np import paddle.fluid as fluid from paddle.fluid.core import PaddleTensor from paddle.fluid.core import PaddleDType +from paddle.inference import Config, Predictor, create_predictor +from paddle.inference import get_trt_compile_version, get_trt_runtime_version class TestInferenceApi(unittest.TestCase): @@ -54,5 +58,60 @@ def test_inference_api(self): tensor_float.ravel().tolist()) +def get_sample_model(): + place = fluid.CPUPlace() + exe = fluid.Executor(place) + + main_program = fluid.Program() + startup_program = fluid.Program() + with fluid.program_guard(main_program, startup_program): + data = fluid.data(name="data", shape=[-1, 6, 64, 64], dtype="float32") + conv_out = fluid.layers.conv2d( + input=data, + num_filters=3, + filter_size=3, + groups=1, + padding=0, + bias_attr=False, + act=None) + exe.run(startup_program) + serialized_program = paddle.static.serialize_program( + data, conv_out, program=main_program) + serialized_params = paddle.static.serialize_persistables( + data, conv_out, executor=exe, program=main_program) + return serialized_program, serialized_params + + +class TestInferenceBaseAPI(unittest.TestCase): + def get_config(self, model, params): + config = Config() + config.set_model_buffer(model, len(model), params, len(params)) + config.enable_use_gpu(100, 0) + return config + + def test_apis(self): + print('trt compile version:', get_trt_compile_version()) + print('trt runtime version:', get_trt_runtime_version()) + program, params = get_sample_model() + config = self.get_config(program, params) + predictor = create_predictor(config) + in_names = predictor.get_input_names() + in_handle = predictor.get_input_handle(in_names[0]) + in_data = np.ones((1, 6, 32, 32)).astype(np.float32) + in_handle.copy_from_cpu(in_data) + predictor.run() + + def test_wrong_input(self): + with self.assertRaises(TypeError): + program, params = get_sample_model() + config = self.get_config(program, params) + predictor = create_predictor(config) + in_names = predictor.get_input_names() + in_handle = predictor.get_input_handle(in_names[0]) + in_data = np.ones((1, 6, 64, 64)).astype(np.float32) + in_handle.copy_from_cpu(list(in_data)) + predictor.run() + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/inference/__init__.py b/python/paddle/inference/__init__.py index 4e17203971662..ec5295b6dfe56 100644 --- a/python/paddle/inference/__init__.py +++ b/python/paddle/inference/__init__.py @@ -20,6 +20,8 @@ from ..fluid.inference import Predictor # noqa: F401 from ..fluid.inference import create_predictor # noqa: F401 from ..fluid.inference import get_version # noqa: F401 +from ..fluid.inference import get_trt_compile_version # noqa: F401 +from ..fluid.inference import get_trt_runtime_version # noqa: F401 from ..fluid.inference import get_num_bytes_of_data_type # noqa: F401 from ..fluid.inference import PredictorPool # noqa: F401 @@ -32,6 +34,8 @@ 'Predictor', 
'create_predictor', 'get_version', + 'get_trt_compile_version', + 'get_trt_runtime_version', 'get_num_bytes_of_data_type', 'PredictorPool' ] From 9e4944725d7ad61ef2092dacdf0fecec78cac3fd Mon Sep 17 00:00:00 2001 From: danleifeng <52735331+danleifeng@users.noreply.github.com> Date: Tue, 19 Oct 2021 15:49:57 +0800 Subject: [PATCH 030/116] [heterps]edit shrink and unseenday logit for pslib (#36194) --- paddle/fluid/framework/fleet/fleet_wrapper.cc | 23 ++++++++++++ paddle/fluid/framework/fleet/fleet_wrapper.h | 2 ++ .../framework/fleet/heter_ps/hashtable_inl.h | 2 +- .../fluid/framework/fleet/ps_gpu_wrapper.cc | 13 +++++++ paddle/fluid/framework/fleet/ps_gpu_wrapper.h | 9 +++++ paddle/fluid/pybind/fleet_wrapper_py.cc | 1 + paddle/fluid/pybind/ps_gpu_wrapper_py.cc | 2 ++ .../distributed/fleet/dataset/dataset.py | 36 +++++++++++++++++++ python/paddle/fluid/dataset.py | 23 ++++++++++++ .../fleet/parameter_server/pslib/__init__.py | 9 +++++ .../unittests/test_communicator_ps_gpu.py | 2 +- 11 files changed, 120 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/framework/fleet/fleet_wrapper.cc b/paddle/fluid/framework/fleet/fleet_wrapper.cc index 4346c144fab7f..7aeb9eaf3f195 100644 --- a/paddle/fluid/framework/fleet/fleet_wrapper.cc +++ b/paddle/fluid/framework/fleet/fleet_wrapper.cc @@ -1334,6 +1334,29 @@ void FleetWrapper::SaveModelOneTablePrefix(const uint64_t table_id, #endif } +void FleetWrapper::SetDate(const uint64_t table_id, const std::string& date) { +#ifdef PADDLE_WITH_PSLIB + assert(date.size() == 8); + int year = std::stoi(date.substr(0, 4)); + int month = std::stoi(date.substr(4, 2)); + int day = std::stoi(date.substr(6, 2)); + struct std::tm b; + b.tm_year = year - 1900; + b.tm_mon = month - 1; + b.tm_mday = day; + b.tm_hour = b.tm_min = b.tm_sec = 0; + std::time_t seconds_from_1970 = std::mktime(&b); + int day_id = seconds_from_1970 / 86400; + auto ret = pslib_ptr_->_worker_ptr->set_day_id(table_id, day_id); + ret.wait(); + if (ret.get() != 0) { + LOG(ERROR) << "setdate : " << date << " failed"; + } +#else + VLOG(0) << "FleetWrapper::SetDate does nothing when no pslib"; +#endif +} + void FleetWrapper::PrintTableStat(const uint64_t table_id) { #ifdef PADDLE_WITH_PSLIB auto ret = pslib_ptr_->_worker_ptr->print_table_stat(table_id); diff --git a/paddle/fluid/framework/fleet/fleet_wrapper.h b/paddle/fluid/framework/fleet/fleet_wrapper.h index d368b421ff2a0..6fddedccf0258 100644 --- a/paddle/fluid/framework/fleet/fleet_wrapper.h +++ b/paddle/fluid/framework/fleet/fleet_wrapper.h @@ -336,6 +336,8 @@ class FleetWrapper { // this performs better than rand_r, especially large data std::default_random_engine& LocalRandomEngine(); + void SetDate(const uint64_t table_id, const std::string& date); + #ifdef PADDLE_WITH_PSLIB static std::shared_ptr pslib_ptr_; #endif diff --git a/paddle/fluid/framework/fleet/heter_ps/hashtable_inl.h b/paddle/fluid/framework/fleet/heter_ps/hashtable_inl.h index 9facbff1f2526..9f3d1a7adcafc 100644 --- a/paddle/fluid/framework/fleet/heter_ps/hashtable_inl.h +++ b/paddle/fluid/framework/fleet/heter_ps/hashtable_inl.h @@ -128,7 +128,7 @@ void HashTable::dump_to_cpu(int devid, cudaStream_t stream) { downpour_value->resize(gpu_val.mf_size + downpour_value_size); } float* cpu_val = downpour_value->data(); - cpu_val[0] = 0; + // cpu_val[0] = 0; cpu_val[1] = gpu_val.delta_score; cpu_val[2] = gpu_val.show; cpu_val[3] = gpu_val.clk; diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc index 
d1e98a711dc9d..d3990c1f3dd76 100644 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc @@ -181,6 +181,19 @@ void PSGPUWrapper::BuildTask(std::shared_ptr gpu_task) { VLOG(3) << "GpuPs shard: " << i << " key len: " << local_keys[i].size(); local_ptr[i].resize(local_keys[i].size()); } + +#ifdef PADDLE_WITH_PSLIB + // get day_id: day nums from 1970 + struct std::tm b; + b.tm_year = year_ - 1900; + b.tm_mon = month_ - 1; + b.tm_mday = day_; + b.tm_min = b.tm_hour = b.tm_sec = 0; + std::time_t seconds_from_1970 = std::mktime(&b); + int day_id = seconds_from_1970 / 86400; + fleet_ptr->pslib_ptr_->_worker_ptr->set_day_id(table_id_, day_id); +#endif + timeline.Start(); auto ptl_func = [this, &local_keys, &local_ptr, &fleet_ptr](int i) { size_t key_size = local_keys[i].size(); diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.h b/paddle/fluid/framework/fleet/ps_gpu_wrapper.h index fa2ff6cbdb8c7..6f785cad33e2d 100644 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.h +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.h @@ -240,6 +240,12 @@ class PSGPUWrapper { mf_max_bound); } } + void SetDate(int year, int month, int day) { + year_ = year; + month_ = month; + day_ = day; + } + void SetDataset(Dataset* dataset) { dataset_ = dataset; } // PSGPUWrapper singleton @@ -283,6 +289,9 @@ class PSGPUWrapper { int thread_keys_thread_num_ = 37; int thread_keys_shard_num_ = 37; uint64_t max_fea_num_per_pass_ = 5000000000; + int year_; + int month_; + int day_; std::shared_ptr< paddle::framework::ChannelObject>> diff --git a/paddle/fluid/pybind/fleet_wrapper_py.cc b/paddle/fluid/pybind/fleet_wrapper_py.cc index d8142f717baed..af1c3da727d41 100644 --- a/paddle/fluid/pybind/fleet_wrapper_py.cc +++ b/paddle/fluid/pybind/fleet_wrapper_py.cc @@ -91,6 +91,7 @@ void BindFleetWrapper(py::module* m) { .def("save_model_one_table", &framework::FleetWrapper::SaveModelOneTable) .def("save_model_one_table_with_prefix", &framework::FleetWrapper::SaveModelOneTablePrefix) + .def("set_date", &framework::FleetWrapper::SetDate) .def("copy_table", &framework::FleetWrapper::CopyTable) .def("copy_table_by_feasign", &framework::FleetWrapper::CopyTableByFeasign); diff --git a/paddle/fluid/pybind/ps_gpu_wrapper_py.cc b/paddle/fluid/pybind/ps_gpu_wrapper_py.cc index 48365f42b11ba..6e98a9479fa26 100644 --- a/paddle/fluid/pybind/ps_gpu_wrapper_py.cc +++ b/paddle/fluid/pybind/ps_gpu_wrapper_py.cc @@ -41,6 +41,8 @@ void BindPSGPUWrapper(py::module* m) { py::call_guard()) .def("init_GPU_server", &framework::PSGPUWrapper::InitializeGPUServer, py::call_guard()) + .def("set_date", &framework::PSGPUWrapper::SetDate, + py::call_guard()) .def("set_dataset", &framework::PSGPUWrapper::SetDataset, py::call_guard()) .def("init_gpu_ps", &framework::PSGPUWrapper::InitializeGPU, diff --git a/python/paddle/distributed/fleet/dataset/dataset.py b/python/paddle/distributed/fleet/dataset/dataset.py index 25a1d98cb1121..e231ac55e679a 100644 --- a/python/paddle/distributed/fleet/dataset/dataset.py +++ b/python/paddle/distributed/fleet/dataset/dataset.py @@ -748,6 +748,42 @@ def _generate_local_tables_unlock(self, table_id, fea_dim, read_thread_num, self.dataset.generate_local_tables_unlock( table_id, fea_dim, read_thread_num, consume_thread_num, shard_num) + def set_date(self, date): + """ + :api_attr: Static Graph + + Set training date for pull sparse parameters, saving and loading model. Only used in psgpu + + Args: + date(str): training date(format : YYMMDD). eg.20211111 + + Examples: + .. 
code-block:: python + + import paddle + paddle.enable_static() + + dataset = paddle.distributed.InMemoryDataset() + slots = ["slot1", "slot2", "slot3", "slot4"] + slots_vars = [] + for slot in slots: + var = paddle.static.data( + name=slot, shape=[None, 1], dtype="int64", lod_level=1) + slots_vars.append(var) + dataset.init( + batch_size=1, + thread_num=2, + input_type=1, + pipe_command="cat", + use_var=slots_vars) + dataset.set_date("20211111") + """ + year = int(date[:4]) + month = int(date[4:6]) + day = int(date[6:]) + if self.use_ps_gpu and core._is_compiled_with_heterps(): + self.psgpu.set_date(year, month, day) + def load_into_memory(self, is_shuffle=False): """ :api_attr: Static Graph diff --git a/python/paddle/fluid/dataset.py b/python/paddle/fluid/dataset.py index d683e36fbe5ab..972f59d1e9058 100644 --- a/python/paddle/fluid/dataset.py +++ b/python/paddle/fluid/dataset.py @@ -716,6 +716,29 @@ def generate_local_tables_unlock(self, table_id, fea_dim, read_thread_num, self.dataset.generate_local_tables_unlock( table_id, fea_dim, read_thread_num, consume_thread_num, shard_num) + def set_date(self, date): + """ + :api_attr: Static Graph + + Set training date for pull sparse parameters, saving and loading model. Only used in psgpu + + Args: + date(str): training date(format : YYMMDD). eg.20211111 + + Examples: + .. code-block:: python + + import paddle.fluid as fluid + + dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset") + dataset.set_date("20211111") + """ + year = int(date[:4]) + month = int(date[4:6]) + day = int(date[6:]) + if self.use_ps_gpu and core._is_compiled_with_heterps(): + self.psgpu.set_date(year, month, day) + @deprecated( since="2.0.0", update_to="paddle.distributed.InMemoryDataset.load_into_memory") diff --git a/python/paddle/fluid/incubate/fleet/parameter_server/pslib/__init__.py b/python/paddle/fluid/incubate/fleet/parameter_server/pslib/__init__.py index 78af7fd65dccb..309532cafc2e1 100644 --- a/python/paddle/fluid/incubate/fleet/parameter_server/pslib/__init__.py +++ b/python/paddle/fluid/incubate/fleet/parameter_server/pslib/__init__.py @@ -799,6 +799,15 @@ def save_one_table(self, table_id, model_dir, **kwargs): self._fleet_ptr.save_model_one_table(table_id, model_dir, mode) self._role_maker._barrier_worker() + def set_date(self, table_id, date): + """ + set_date, eg, 20210918 + """ + self._role_maker._barrier_worker() + if self._role_maker.is_first_worker(): + self._fleet_ptr.set_date(table_id, str(date)) + self._role_maker._barrier_worker() + def _set_opt_info(self, opt_info): """ this function saves the result from DistributedOptimizer.minimize() diff --git a/python/paddle/fluid/tests/unittests/test_communicator_ps_gpu.py b/python/paddle/fluid/tests/unittests/test_communicator_ps_gpu.py index 6ab8a2c3a4b22..1faa084d412e4 100644 --- a/python/paddle/fluid/tests/unittests/test_communicator_ps_gpu.py +++ b/python/paddle/fluid/tests/unittests/test_communicator_ps_gpu.py @@ -74,6 +74,7 @@ def test_communicator_ps_gpu(self): batch_size=32, thread_num=1, pipe_command="cat", use_var=slots_vars) dataset.set_filelist(["test_communicator_ps_gpu.txt"]) dataset._set_use_ps_gpu(1) + dataset.set_date("20211111") dataset.load_into_memory(is_shuffle=True) os.environ["TEST_MODE"] = "1" @@ -88,7 +89,6 @@ def test_communicator_ps_gpu(self): pass except Exception as e: self.assertTrue(False) - time.sleep(10) fleet.stop_worker() os.remove("./test_communicator_ps_gpu.txt") From 49d7bd38448b7b876a08af8c8afb1062d9469f14 Mon Sep 17 00:00:00 2001 From: Qi Li Date: Tue, 19 
Oct 2021 15:56:57 +0800 Subject: [PATCH 031/116] [NPU] update inference cmake, test=develop (#36505) * [NPU] update inference cmake, test=develop * address review comments, test=develop * fix compile error when WITH_ASCEND_CXX11 ON, test=develop --- cmake/external/ascend.cmake | 32 +++++++++++++++++++++++++++ cmake/inference_lib.cmake | 9 +++++++- cmake/miopen.cmake | 2 -- paddle/fluid/platform/resource_pool.h | 1 + 4 files changed, 41 insertions(+), 3 deletions(-) diff --git a/cmake/external/ascend.cmake b/cmake/external/ascend.cmake index 414b2a54be034..b643923cdd353 100644 --- a/cmake/external/ascend.cmake +++ b/cmake/external/ascend.cmake @@ -85,5 +85,37 @@ if(WITH_ASCEND_CL) ADD_LIBRARY(acl_op_compiler SHARED IMPORTED GLOBAL) SET_PROPERTY(TARGET acl_op_compiler PROPERTY IMPORTED_LOCATION ${acl_op_compiler_lib}) add_custom_target(extern_ascend_cl DEPENDS ascendcl acl_op_compiler) +endif() +if (WITH_ASCEND_CL) +macro(find_ascend_toolkit_version ascend_toolkit_version_info) + file(READ ${ascend_toolkit_version_info} ASCEND_TOOLKIT_VERSION_CONTENTS) + string(REGEX MATCH "version=([0-9]+\.[0-9]+\.[0-9]+\.[a-z]*[0-9]*)" ASCEND_TOOLKIT_VERSION "${ASCEND_TOOLKIT_VERSION_CONTENTS}") + string(REGEX REPLACE "version=([0-9]+\.[0-9]+\.[0-9]+\.[a-z]*[0-9]*)" "\\1" ASCEND_TOOLKIT_VERSION "${ASCEND_TOOLKIT_VERSION}") + if(NOT ASCEND_TOOLKIT_VERSION) + set(ASCEND_TOOLKIT_VERSION "???") + else() + message(STATUS "Current Ascend Toolkit version is ${ASCEND_TOOLKIT_VERSION}") + endif() +endmacro() + +macro(find_ascend_driver_version ascend_driver_version_info) + file(READ ${ascend_driver_version_info} ASCEND_DRIVER_VERSION_CONTENTS) + string(REGEX MATCH "Version=([0-9]+\.[0-9]+\.[0-9]+)" ASCEND_DRIVER_VERSION "${ASCEND_DRIVER_VERSION_CONTENTS}") + string(REGEX REPLACE "Version=([0-9]+\.[0-9]+\.[0-9]+)" "\\1" ASCEND_DRIVER_VERSION "${ASCEND_DRIVER_VERSION}") + if(NOT ASCEND_DRIVER_VERSION) + set(ASCEND_DRIVER_VERSION "???") + else() + message(STATUS "Current Ascend Driver version is ${ASCEND_DRIVER_VERSION}") + endif() +endmacro() + +if (WITH_ARM) + set(ASCEND_TOOLKIT_DIR ${ASCEND_DIR}/ascend-toolkit/latest/arm64-linux) +else() + set(ASCEND_TOOLKIT_DIR ${ASCEND_DIR}/ascend-toolkit/latest/x86_64-linux) endif() + +find_ascend_toolkit_version(${ASCEND_TOOLKIT_DIR}/ascend_toolkit_install.info) +find_ascend_driver_version(${ASCEND_DIR}/driver/version.info) +endif() \ No newline at end of file diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake index cb2ed614d3d7c..5ffbf15c960a3 100644 --- a/cmake/inference_lib.cmake +++ b/cmake/inference_lib.cmake @@ -353,7 +353,9 @@ function(version version_file) "WITH_MKL: ${WITH_MKL}\n" "WITH_MKLDNN: ${WITH_MKLDNN}\n" "WITH_GPU: ${WITH_GPU}\n" - "WITH_ROCM: ${WITH_ROCM}\n") + "WITH_ROCM: ${WITH_ROCM}\n" + "WITH_ASCEND_CL: ${WITH_ASCEND_CL}\n" + "WITH_ASCEND_CXX11: ${WITH_ASCEND_CXX11}\n") if(WITH_GPU) file(APPEND ${version_file} "CUDA version: ${CUDA_VERSION}\n" @@ -364,6 +366,11 @@ function(version version_file) "HIP version: ${HIP_VERSION}\n" "MIOpen version: v${MIOPEN_MAJOR_VERSION}.${MIOPEN_MINOR_VERSION}\n") endif() + if(WITH_ASCEND_CL) + file(APPEND ${version_file} + "Ascend Toolkit version: ${ASCEND_TOOLKIT_VERSION}\n" + "Ascend Driver version: ${ASCEND_DRIVER_VERSION}\n") + endif() file(APPEND ${version_file} "CXX compiler version: ${CMAKE_CXX_COMPILER_VERSION}\n") if(TENSORRT_FOUND) file(APPEND ${version_file} diff --git a/cmake/miopen.cmake b/cmake/miopen.cmake index f482f423dc5c1..493c37955f725 100644 --- a/cmake/miopen.cmake +++ 
b/cmake/miopen.cmake @@ -15,8 +15,6 @@ find_path(MIOPEN_INCLUDE_DIR "miopen/miopen.h" NO_DEFAULT_PATH ) -get_filename_component(__libpath_hist ${CUDA_CUDART_LIBRARY} PATH) - find_library(MIOPEN_LIBRARY NAMES "libMIOpen.so" PATHS ${MIOPEN_ROOT} ${MIOPEN_ROOT}/lib ${MIOPEN_ROOT}/lib64 ${__libpath_hist} $ENV{MIOPEN_ROOT} $ENV{MIOPEN_ROOT}/lib $ENV{MIOPEN_ROOT}/lib64 diff --git a/paddle/fluid/platform/resource_pool.h b/paddle/fluid/platform/resource_pool.h index 3603c0f24f279..f01d006d5b273 100644 --- a/paddle/fluid/platform/resource_pool.h +++ b/paddle/fluid/platform/resource_pool.h @@ -16,6 +16,7 @@ #include #include +#include #include #include #include From f2612462bd0dcc87f406e458240155d2c9108613 Mon Sep 17 00:00:00 2001 From: Kaipeng Deng Date: Tue, 19 Oct 2021 16:54:54 +0800 Subject: [PATCH 032/116] fix op_flops not define. test=develop (#36489) --- python/paddle/hapi/static_flops.py | 1 + 1 file changed, 1 insertion(+) diff --git a/python/paddle/hapi/static_flops.py b/python/paddle/hapi/static_flops.py index 07fc19b2cb89a..f386bbd0dd6db 100644 --- a/python/paddle/hapi/static_flops.py +++ b/python/paddle/hapi/static_flops.py @@ -176,6 +176,7 @@ def count_element_op(op): def _graph_flops(graph, detail=False): assert isinstance(graph, GraphWrapper) flops = 0 + op_flops = 0 table = Table(["OP Type", 'Param name', "Flops"]) for op in graph.ops(): param_name = '' From 999242e35f450e2904df22a56ca8954f1811dbf8 Mon Sep 17 00:00:00 2001 From: zhulei <563755780@qq.com> Date: Tue, 19 Oct 2021 19:32:30 +0800 Subject: [PATCH 033/116] [NPU] Add iou_similarity op (#36412) * [NPU] Add iou_similarity op * [NPU] Add iou_similarity op * [NPU] Add iou_similarity op --- .../fluid/operators/detection/CMakeLists.txt | 2 + .../detection/iou_similarity_op_npu.cc | 192 ++++++++++++++++++ .../npu/test_iou_similarity_op_npu.py | 126 ++++++++++++ 3 files changed, 320 insertions(+) create mode 100644 paddle/fluid/operators/detection/iou_similarity_op_npu.cc create mode 100644 python/paddle/fluid/tests/unittests/npu/test_iou_similarity_op_npu.py diff --git a/paddle/fluid/operators/detection/CMakeLists.txt b/paddle/fluid/operators/detection/CMakeLists.txt index 871240aa15fce..506ae56a12642 100644 --- a/paddle/fluid/operators/detection/CMakeLists.txt +++ b/paddle/fluid/operators/detection/CMakeLists.txt @@ -64,6 +64,8 @@ endif() if(WITH_XPU) detection_library(iou_similarity_op SRCS iou_similarity_op.cc iou_similarity_op_xpu.cc) +elseif(WITH_ASCEND_CL) + detection_library(iou_similarity_op SRCS iou_similarity_op.cc iou_similarity_op_npu.cc) else() detection_library(iou_similarity_op SRCS iou_similarity_op.cc iou_similarity_op.cu) endif() diff --git a/paddle/fluid/operators/detection/iou_similarity_op_npu.cc b/paddle/fluid/operators/detection/iou_similarity_op_npu.cc new file mode 100644 index 0000000000000..9a91d4bd8fac1 --- /dev/null +++ b/paddle/fluid/operators/detection/iou_similarity_op_npu.cc @@ -0,0 +1,192 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/detection/iou_similarity_op.h" +#include "paddle/fluid/operators/npu_op_runner.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +struct IouFunction { + public: + explicit IouFunction(const framework::ExecutionContext& ctx) : ctx(ctx) { + place = ctx.GetPlace(); + stream = ctx.template device_context() + .stream(); + } + void Transpose(const Tensor* x, Tensor* y, const std::vector& axis) { + // y should be init first + const auto& runner = + NpuOpRunner("TransposeD", {*x}, {*y}, {{"perm", axis}}); + runner.Run(stream); + } + void Add(const Tensor* x, const Tensor* y, Tensor* z) { + // y should be init first + const auto& runner = NpuOpRunner("AddV2", {*x, *y}, {*z}, {}); + runner.Run(stream); + } + void Sub(const Tensor* x, const Tensor* y, Tensor* z) { + // y should be init first + const auto& runner = NpuOpRunner("Sub", {*x, *y}, {*z}, {}); + runner.Run(stream); + } + void Mul(const Tensor* x, const Tensor* y, Tensor* z) { + // y should be init first + const auto& runner = NpuOpRunner("Mul", {*x, *y}, {*z}, {}); + runner.Run(stream); + } + void DivNoNan(const Tensor* x, const Tensor* y, Tensor* z) { + // y should be init first + const auto& runner = NpuOpRunner("DivNoNan", {*x, *y}, {*z}, {}); + runner.Run(stream); + } + void Adds(const Tensor* x, float scalar, Tensor* y) { + // y should be init first + const auto& runner = NpuOpRunner("Adds", {*x}, {*y}, {{"value", scalar}}); + runner.Run(stream); + } + void Maximum(const Tensor* x, const Tensor* y, Tensor* z) { + // z should be init first + const auto& runner = NpuOpRunner("Maximum", {*x, *y}, {*z}, {}); + runner.Run(stream); + } + void Minimum(const Tensor* x, const Tensor* y, Tensor* z) { + // z should be init first + const auto& runner = NpuOpRunner("Minimum", {*x, *y}, {*z}, {}); + runner.Run(stream); + } + + private: + platform::Place place; + aclrtStream stream; + const framework::ExecutionContext& ctx; +}; + +template +class IouSimilarityNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + bool normalized = ctx.Attr("box_normalized"); + auto* out = ctx.Output("Out"); + + auto _type = x->type(); + auto place = ctx.GetPlace(); + + IouFunction F(ctx); + + auto N = x->dims()[0]; + auto M = y->dims()[0]; + + out->mutable_data({N, M}, place); + Tensor xt(_type); + Tensor yt(_type); + xt.mutable_data({4, N}, place); + yt.mutable_data({4, M}, place); + std::vector vec_trans = {1, 0}; + F.Transpose(x, &xt, vec_trans); + F.Transpose(y, &yt, vec_trans); + Tensor xmin1 = xt.Slice(0, 1); + Tensor ymin1 = xt.Slice(1, 2); + Tensor xmax1 = xt.Slice(2, 3); + Tensor ymax1 = xt.Slice(3, 4); + Tensor xmin2 = yt.Slice(0, 1); + Tensor ymin2 = yt.Slice(1, 2); + Tensor xmax2 = yt.Slice(2, 3); + Tensor ymax2 = yt.Slice(3, 4); + xmin1.Resize({N, 1}); + ymin1.Resize({N, 1}); + xmax1.Resize({N, 1}); + ymax1.Resize({N, 1}); + xmin2.Resize({1, M}); + ymin2.Resize({1, M}); + xmax2.Resize({1, M}); + ymax2.Resize({1, M}); + + Tensor w1(_type); + Tensor h1(_type); + Tensor w2(_type); + Tensor h2(_type); + Tensor area1(_type); + Tensor area2(_type); + w1.mutable_data({N, 1}, place); + h1.mutable_data({N, 1}, place); + w2.mutable_data({1, M}, place); + h2.mutable_data({1, M}, place); + area1.mutable_data({N, 1}, place); + area2.mutable_data({1, M}, place); + F.Sub(&xmax1, &xmin1, &w1); + F.Sub(&ymax1, &ymin1, &h1); + F.Sub(&xmax2, &xmin2, &w2); + 
F.Sub(&ymax2, &ymin2, &h2); + if (!normalized) { + F.Adds(&w1, 1.0f, &w1); + F.Adds(&h1, 1.0f, &h1); + F.Adds(&w2, 1.0f, &w2); + F.Adds(&h2, 1.0f, &h2); + } + F.Mul(&w1, &h1, &area1); + F.Mul(&w2, &h2, &area2); + + Tensor inter_xmax(_type); + Tensor inter_ymax(_type); + Tensor inter_xmin(_type); + Tensor inter_ymin(_type); + inter_xmax.mutable_data({N, M}, place); + inter_ymax.mutable_data({N, M}, place); + inter_xmin.mutable_data({N, M}, place); + inter_ymin.mutable_data({N, M}, place); + F.Minimum(&xmax1, &xmax2, &inter_xmax); + F.Minimum(&ymax1, &ymax2, &inter_ymax); + F.Maximum(&xmin1, &xmin2, &inter_xmin); + F.Maximum(&ymin1, &ymin2, &inter_ymin); + + Tensor inter_w(_type); + Tensor inter_h(_type); + inter_w.mutable_data({N, M}, place); + inter_h.mutable_data({N, M}, place); + F.Sub(&inter_xmax, &inter_xmin, &inter_w); + F.Sub(&inter_ymax, &inter_ymin, &inter_h); + + if (!normalized) { + F.Adds(&inter_w, 1.0f, &inter_w); + F.Adds(&inter_h, 1.0f, &inter_h); + } + Tensor zeros(_type); + zeros.mutable_data({1}, place); + FillNpuTensorWithConstant(&zeros, static_cast(0)); + F.Maximum(&inter_w, &zeros, &inter_w); + F.Maximum(&inter_h, &zeros, &inter_h); + + F.Mul(&inter_w, &inter_h, out); + Tensor union_area(_type); + union_area.mutable_data({N, M}, place); + F.Add(&area1, &area2, &union_area); + F.Sub(&union_area, out, &union_area); + F.DivNoNan(out, &union_area, out); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_NPU_KERNEL(iou_similarity, ops::IouSimilarityNPUKernel, + ops::IouSimilarityNPUKernel); diff --git a/python/paddle/fluid/tests/unittests/npu/test_iou_similarity_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_iou_similarity_op_npu.py new file mode 100644 index 0000000000000..22042ce49200b --- /dev/null +++ b/python/paddle/fluid/tests/unittests/npu/test_iou_similarity_op_npu.py @@ -0,0 +1,126 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
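The kernel above is the NPU counterpart of the usual pairwise-IoU formula; a compact NumPy sketch follows (boxes are [xmin, ymin, xmax, ymax]; box_normalized=False adds the +1 pixel convention), matching the reference computation in the test below:

import numpy as np

def iou_similarity(boxes1, boxes2, box_normalized=True):
    # boxes1: [N, 4], boxes2: [M, 4] -> pairwise IoU of shape [N, M].
    off = 0.0 if box_normalized else 1.0
    x1min, y1min, x1max, y1max = [boxes1[:, i:i + 1] for i in range(4)]  # [N, 1]
    x2min, y2min, x2max, y2max = [boxes2[:, i] for i in range(4)]        # [M]
    area1 = (x1max - x1min + off) * (y1max - y1min + off)
    area2 = (x2max - x2min + off) * (y2max - y2min + off)
    iw = np.maximum(np.minimum(x1max, x2max) - np.maximum(x1min, x2min) + off, 0.)
    ih = np.maximum(np.minimum(y1max, y2max) - np.maximum(y1min, y2min) + off, 0.)
    inter = iw * ih
    return inter / (area1 + area2 - inter)

b1 = np.array([[0., 0., 2., 2.], [1., 1., 3., 3.]], dtype=np.float32)
b2 = np.array([[0., 0., 2., 2.], [2., 2., 4., 4.]], dtype=np.float32)
print(iou_similarity(b1, b2))  # [[1., 0.], [1/7, 1/7]]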
+ +from __future__ import print_function + +import unittest +import numpy as np +import numpy.random as random +import sys +sys.path.append("..") +import math +import paddle +from op_test import OpTest + +paddle.enable_static() + +np.random.seed(2021) + + +class TestNpuIouSimilarityOp(OpTest): + def setUp(self): + self.op_type = "iou_similarity" + self.set_npu() + self.init_dtype() + self.set_init_config() + self.set_attrs() + self.set_inputs() + self.set_outputs() + + def set_npu(self): + self.__class__.use_npu = True + self.place = paddle.NPUPlace(0) + + def init_dtype(self): + self.dtype = np.float32 + + def set_init_config(self): + self.N = 2 + self.M = 3 + self.box_normalized = False + self.use_lod = False + + def set_inputs(self): + self.boxes1 = random.rand(self.N, 4).astype(self.dtype) + self.boxes2 = random.rand(self.M, 4).astype(self.dtype) + if self.use_lod: + self.boxes1_lod = [[1 for _ in range(self.N)]] + self.inputs = { + 'X': (self.boxes1, self.boxes1_lod), + 'Y': self.boxes2 + } + else: + self.inputs = {'X': self.boxes1, 'Y': self.boxes2} + + def set_attrs(self): + self.attrs = {"box_normalized": self.box_normalized} + + def set_outputs(self): + self.output = random.rand(self.N, self.M).astype(self.dtype) + self._compute_iou() + self.outputs = {'Out': self.output} + + def test_check_output(self): + self.check_output_with_place(self.place) + + def _compute_iou(self, ): + for row in range(self.boxes1.shape[0]): + for col in range(self.boxes2.shape[0]): + xmin1, ymin1, xmax1, ymax1 = self.boxes1[row] + xmin2, ymin2, xmax2, ymax2 = self.boxes2[col] + if not self.box_normalized: + area1 = (ymax1 - ymin1 + 1) * (xmax1 - xmin1 + 1) + area2 = (ymax2 - ymin2 + 1) * (xmax2 - xmin2 + 1) + else: + area1 = (ymax1 - ymin1) * (xmax1 - xmin1) + area2 = (ymax2 - ymin2) * (xmax2 - xmin2) + + inter_xmax = min(xmax1, xmax2) + inter_ymax = min(ymax1, ymax2) + inter_xmin = max(xmin1, xmin2) + inter_ymin = max(ymin1, ymin2) + inter_height = inter_ymax - inter_ymin + inter_width = inter_xmax - inter_xmin + if not self.box_normalized: + inter_height += 1 + inter_width += 1 + inter_height = max(inter_height, 0) + inter_width = max(inter_width, 0) + inter_area = inter_width * inter_height + union_area = area1 + area2 - inter_area + sim_score = inter_area / union_area + self.output[row, col] = sim_score + + +class TestNpuIouSimilarityOpWithLoD(TestNpuIouSimilarityOp): + def set_init_config(self): + super(TestNpuIouSimilarityOpWithLoD, self).set_init_config() + self.box_normalized = True + self.use_lod = True + + +class TestNpuIouSimilarityOpWithBoxNormalized(TestNpuIouSimilarityOp): + def set_init_config(self): + super(TestNpuIouSimilarityOpWithBoxNormalized, self).set_init_config() + self.box_normalized = True + self.use_lod = True + + +def TestNpuIouSimilarityOpFp16(TestNpuIouSimilarityOp): + def init_dtype(self): + self.dtype = np.float16 + + +if __name__ == '__main__': + unittest.main() From 51c97d9f14048c60fa901f397e3ba540ec353226 Mon Sep 17 00:00:00 2001 From: Weilong Wu <87417304+veyron95@users.noreply.github.com> Date: Tue, 19 Oct 2021 19:37:06 +0800 Subject: [PATCH 034/116] Support elementwise_add triple grad Kernel (#36508) * Support elementwise_add triple grad Kernel * Change code-format to follow CI std --- .../elementwise/elementwise_add_op.cc | 47 ++++++++++++-- .../elementwise/elementwise_add_op.cu | 11 ++++ .../elementwise/elementwise_add_op.h | 39 ++++++++++++ .../operators/elementwise/elementwise_op.h | 61 +++++++++++++++++++ .../fluid/tests/unittests/gradient_checker.py | 12 +++- 
.../unittests/test_elementwise_nn_grad.py | 54 ++++++++++++++++ 6 files changed, 217 insertions(+), 7 deletions(-) diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op.cc b/paddle/fluid/operators/elementwise/elementwise_add_op.cc index 67e2e3a1e9677..d66d6b66a0582 100644 --- a/paddle/fluid/operators/elementwise/elementwise_add_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_add_op.cc @@ -110,6 +110,25 @@ class ElementwiseAddDoubleGradMaker : public framework::SingleGradOpMaker { } }; +template +class ElementwiseAddTripleGradMaker : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + protected: + void Apply(GradOpPtr op) const override { + op->SetType("elementwise_add_triple_grad"); + op->SetInput("DDX", this->Input("DDX")); + op->SetInput("DDY", this->Input("DDY")); + op->SetInput("D_DDOut", this->OutputGrad("DDOut")); + + op->SetAttrMap(this->Attrs()); + + op->SetOutput("D_DDX", this->InputGrad("DDX")); + op->SetOutput("D_DDY", this->InputGrad("DDY")); + } +}; + } // namespace operators } // namespace paddle @@ -123,10 +142,16 @@ REGISTER_OPERATOR( ops::ElementwiseAddDoubleGradMaker, ops::ElementwiseAddDoubleGradMaker); -REGISTER_OPERATOR(elementwise_add_grad_grad, - ops::ElementwiseOpDoubleGradWithoutDXDY, - ops::ElementwiseDoubleGradOpInplaceInferer, - ops::ElementwiseDoubleGradNoBufVarsInferer); +REGISTER_OPERATOR( + elementwise_add_grad_grad, ops::ElementwiseOpDoubleGradWithoutDXDY, + ops::ElementwiseDoubleGradOpInplaceInferer, + ops::ElementwiseDoubleGradNoBufVarsInferer, + ops::ElementwiseAddTripleGradMaker, + ops::ElementwiseAddTripleGradMaker); + +REGISTER_OPERATOR(elementwise_add_triple_grad, ops::ElementwiseOpTripleGrad, + ops::ElementwiseTripleGradOpInplaceInferer, + ops::ElementwiseTripleGradNoBufVarsInferer); REGISTER_OP_CPU_KERNEL( elementwise_add, @@ -162,6 +187,20 @@ REGISTER_OP_CPU_KERNEL( paddle::platform::complex>, ops::ElementwiseAddDoubleGradKernel>); +REGISTER_OP_CPU_KERNEL( + elementwise_add_triple_grad, + ops::ElementwiseAddTripleGradKernel, + ops::ElementwiseAddTripleGradKernel, + ops::ElementwiseAddTripleGradKernel, + ops::ElementwiseAddTripleGradKernel, + ops::ElementwiseAddTripleGradKernel>, + ops::ElementwiseAddTripleGradKernel>); // A specialization elementwise_add operator, used in gradient accumulation with // inplace addto. 
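Because DDOut = DDX + DDY, the triple-grad outputs D_DDX and D_DDY are simply D_DDOut mapped back onto each input's shape, summed over any broadcast axes. A rough NumPy reference of that behaviour, assuming NumPy-style broadcasting (the helper below is illustrative only, not part of the patch):

    import numpy as np

    def ref_add_triple_grad(d_ddout, ddx_shape, ddy_shape):
        # d(DDOut)/d(DDX) and d(DDOut)/d(DDY) are both identity, so each output
        # is D_DDOut reduced back onto the shape of the corresponding input.
        def reduce_to(grad, shape):
            while grad.ndim > len(shape):
                grad = grad.sum(axis=0)
            for axis, size in enumerate(shape):
                if size == 1 and grad.shape[axis] != 1:
                    grad = grad.sum(axis=axis, keepdims=True)
            return grad
        return reduce_to(d_ddout, ddx_shape), reduce_to(d_ddout, ddy_shape)

    d_ddx, d_ddy = ref_add_triple_grad(np.ones((2, 3, 4)), (2, 3, 4), (3, 4))
    # d_ddx keeps shape (2, 3, 4); d_ddy is summed over axis 0 -> shape (3, 4)

This is also why the CPU and GPU kernels added below can fall back to a plain TensorCopy whenever D_DDX or D_DDY already has the same dims as D_DDOut.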
diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op.cu b/paddle/fluid/operators/elementwise/elementwise_add_op.cu index 331867617bd78..0b78aa4a01a74 100644 --- a/paddle/fluid/operators/elementwise/elementwise_add_op.cu +++ b/paddle/fluid/operators/elementwise/elementwise_add_op.cu @@ -196,6 +196,17 @@ REGISTER_OP_CUDA_KERNEL( plat::complex>, ops::ElementwiseAddDoubleGradKernel>); +REGISTER_OP_CUDA_KERNEL( + elementwise_add_triple_grad, + ops::ElementwiseAddTripleGradKernel, + ops::ElementwiseAddTripleGradKernel, + ops::ElementwiseAddTripleGradKernel, + ops::ElementwiseAddTripleGradKernel, + ops::ElementwiseAddTripleGradKernel, + ops::ElementwiseAddTripleGradKernel>, + ops::ElementwiseAddTripleGradKernel>); REGISTER_OP_CUDA_KERNEL( grad_add, ops::ElementwiseAddKernel, diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op.h b/paddle/fluid/operators/elementwise/elementwise_add_op.h index 6c61ce61eecd5..0ce4ca665dd9d 100644 --- a/paddle/fluid/operators/elementwise/elementwise_add_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_add_op.h @@ -205,5 +205,44 @@ class ElementwiseAddDoubleGradKernel : public framework::OpKernel { } }; +template +class ElementwiseAddTripleGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + using Tensor = framework::Tensor; + auto *ddx = ctx.Input("DDX"); + auto *ddy = ctx.Input("DDY"); + auto *d_ddout = ctx.Input("D_DDOut"); + auto *d_ddx = ctx.Output("D_DDX"); + auto *d_ddy = ctx.Output("D_DDY"); + // skip out + auto *out = d_ddout; + + // Special case when d_ddy is not needed and d_ddx doesn't reduce + if (d_ddx != nullptr && d_ddy == nullptr && + d_ddx->dims() == d_ddout->dims()) { + VLOG(4) << "Special case when d_ddy is not needed and d_ddx doesn't " + "reduce"; + framework::TensorCopy( + *d_ddout, ctx.GetPlace(), + ctx.template device_context(), d_ddx); + } else if (d_ddx == nullptr && d_ddy != nullptr && + d_ddy->dims() == d_ddout->dims()) { + VLOG(4) << "Special case when d_ddx is not needed and d_ddy doesn't " + "reduce"; + framework::TensorCopy( + *d_ddout, ctx.GetPlace(), + ctx.template device_context(), d_ddy); + } else if (d_ddx != nullptr && d_ddy != nullptr && + (d_ddx->dims() == d_ddy->dims())) { + elementwise_add_grad(ctx, ddx, ddy, out, d_ddout, d_ddx, + d_ddy); + } else { + default_elementwise_add_grad(ctx, ddx, ddy, out, + d_ddout, d_ddx, d_ddy); + } + } +}; + } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/elementwise/elementwise_op.h b/paddle/fluid/operators/elementwise/elementwise_op.h index 3614602156f4d..5703e904c240b 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_op.h @@ -426,6 +426,62 @@ class ElementwiseOpDoubleGradWithoutDXDY } }; +class ElementwiseOpTripleGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + using Tensor = framework::Tensor; + + void InferShape(framework::InferShapeContext *ctx) const override { + if (ctx->HasOutput("D_DDX")) { + ctx->ShareDim("DDX", "D_DDX"); + ctx->ShareLoD("DDX", "D_DDX"); + } + if (ctx->HasOutput("D_DDY")) { + ctx->ShareDim("DDY", "D_DDY"); + ctx->ShareLoD("DDY", "D_DDY"); + } + } + + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + framework::proto::VarType::Type input_data_type; + if (ctx.HasInput("DDX") == false) { + OP_INOUT_CHECK(ctx.HasInput("DDY"), "Input", 
"DDY", + "ElementwiseOpTripleGrad"); + input_data_type = OperatorWithKernel::IndicateVarDataType(ctx, "DDY"); + } else if (ctx.HasInput("DDY") == false) { + OP_INOUT_CHECK(ctx.HasInput("DDX"), "Input", "DDX", + "ElementwiseOpTripleGrad"); + input_data_type = OperatorWithKernel::IndicateVarDataType(ctx, "DDX"); + } else { + input_data_type = + OperatorWithKernel::IndicateOrPromoteVarDataTypes(ctx, "DDX", "DDY"); + } + +#ifdef PADDLE_WITH_MKLDNN + if (this->CanMKLDNNBeUsed(ctx, input_data_type)) { + return framework::OpKernelType(input_data_type, ctx.GetPlace(), + framework::DataLayout::kMKLDNN, + framework::LibraryType::kMKLDNN); + } +#endif + return framework::OpKernelType(input_data_type, ctx.GetPlace()); + } + + framework::OpKernelType GetKernelTypeForVar( + const std::string &var_name, const framework::Tensor &tensor, + const framework::OpKernelType &expected_kernel_type) const { + if (framework::IsComplexType(expected_kernel_type.data_type_)) { + // only promote inputsā€™s types when contains complex input + return framework::OpKernelType(tensor.type(), tensor.place(), + tensor.layout()); + } else { + return framework::OpKernelType(expected_kernel_type.data_type_, + tensor.place(), tensor.layout()); + } + } +}; + template class ElemwiseGradKernel : public framework::OpKernel { public: @@ -447,9 +503,14 @@ DECLARE_INPLACE_OP_INFERER(ElementwiseGradOpInplaceInferer, DECLARE_INPLACE_OP_INFERER(ElementwiseDoubleGradOpInplaceInferer, {"DDX", "DDOut"}); +DECLARE_INPLACE_OP_INFERER(ElementwiseTripleGradOpInplaceInferer, + {"D_DDOut", "D_DDX"}); + DECLARE_NO_NEED_BUFFER_VARS_INFERER(ElementwiseGradNoBufVarsInferer, "X", "Y"); DECLARE_NO_NEED_BUFFER_VARS_INFERER(ElementwiseDoubleGradNoBufVarsInferer, "Y", "DOut"); +DECLARE_NO_NEED_BUFFER_VARS_INFERER(ElementwiseTripleGradNoBufVarsInferer, + "DDX", "DDY"); } // namespace operators } // namespace paddle diff --git a/python/paddle/fluid/tests/unittests/gradient_checker.py b/python/paddle/fluid/tests/unittests/gradient_checker.py index 01aa2fd9efa4f..b56bbc07a7f44 100644 --- a/python/paddle/fluid/tests/unittests/gradient_checker.py +++ b/python/paddle/fluid/tests/unittests/gradient_checker.py @@ -486,20 +486,26 @@ def triple_grad_check(x, var_to_np_array_in_scope(scope, place, v.name) for v in x_grads_grads ] - # append second order grads - target_grads_grads = fluid.gradients(target_grads, x, x_grads_grads) x += y_grads x_init = _as_list(x_init) x_init += y_grads_init + # append second order grads + target_grads_grads = fluid.gradients(target_grads, x, x_grads_grads) + + # filter None in target_grads_grads for Dy/Dx may be None in kernel + filted = [(i, dyi) for i, dyi in enumerate(target_grads_grads) + if dyi is not None] + filted_idx, filted_target_grads_grads = zip(*filted) + x += x_grads_grads x_init += x_grads_grads_init # x <=> [x, dout, ddx] grad_check( x=x, - y=target_grads_grads, + y=filted_target_grads_grads, x_init=x_init, place=place, program=program, diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_nn_grad.py b/python/paddle/fluid/tests/unittests/test_elementwise_nn_grad.py index 12b75c8bf703d..0dba2b1924d24 100644 --- a/python/paddle/fluid/tests/unittests/test_elementwise_nn_grad.py +++ b/python/paddle/fluid/tests/unittests/test_elementwise_nn_grad.py @@ -243,5 +243,59 @@ def test_grad(self): self.func(p) +class TestElementwiseAddTripleGradCheck(unittest.TestCase): + @prog_scope() + def func(self, place): + # the shape of input variable should be clearly specified, not inlcude -1. 
+ shape = [2, 3, 4, 5] + eps = 0.005 + dtype = np.float64 + + x = layers.data('x', shape, False, dtype) + y = layers.data('y', shape, False, dtype) + x.persistable = True + y.persistable = True + out = layers.elementwise_add(x, y) + x_arr = np.random.uniform(-1, 1, shape).astype(dtype) + y_arr = np.random.uniform(-1, 1, shape).astype(dtype) + + gradient_checker.triple_grad_check( + [x, y], out, x_init=[x_arr, y_arr], place=place, eps=eps) + + def test_grad(self): + places = [fluid.CPUPlace()] + if core.is_compiled_with_cuda(): + places.append(fluid.CUDAPlace(0)) + for p in places: + self.func(p) + + +class TestElementwiseAddBroadcastTripleGradCheck(unittest.TestCase): + @prog_scope() + def func(self, place): + # the shape of input variable should be clearly specified, not inlcude -1. + shape = [2, 3, 4, 5] + eps = 0.005 + dtype = np.float64 + + x = layers.data('x', shape, False, dtype) + y = layers.data('y', shape[:-1], False, dtype) + x.persistable = True + y.persistable = True + out = layers.elementwise_add(x, y, axis=0) + x_arr = np.random.uniform(-1, 1, shape).astype(dtype) + y_arr = np.random.uniform(-1, 1, shape[:-1]).astype(dtype) + + gradient_checker.triple_grad_check( + [x, y], out, x_init=[x_arr, y_arr], place=place, eps=eps) + + def test_grad(self): + places = [fluid.CPUPlace()] + if core.is_compiled_with_cuda(): + places.append(fluid.CUDAPlace(0)) + for p in places: + self.func(p) + + if __name__ == "__main__": unittest.main() From fe01ba6a14f9d8209fc07346c7701f953e8dba44 Mon Sep 17 00:00:00 2001 From: 0x45f <23097963+0x45f@users.noreply.github.com> Date: Wed, 20 Oct 2021 10:16:52 +0800 Subject: [PATCH 035/116] remove no_value using var.name (#36513) * remove no_value using var.name * fix unit test for CI * fix unit test * add test case * fix test case * add more test case --- .../dygraph_to_static/convert_operators.py | 42 +++++++- .../dygraph_to_static/variable_trans_func.py | 6 +- .../test_convert_operators.py | 95 +++++++++++++++++++ .../test_program_translator.py | 4 +- .../test_variable_trans_func.py | 18 ++-- 5 files changed, 151 insertions(+), 14 deletions(-) diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/convert_operators.py b/python/paddle/fluid/dygraph/dygraph_to_static/convert_operators.py index 4126e94225943..d27af5c0dd9e0 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/convert_operators.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/convert_operators.py @@ -20,6 +20,7 @@ from paddle.fluid.layers import assign, fill_constant, slice, reduce_all, reduce_any from paddle.fluid.layers import cast, control_flow, logical_and, logical_not, logical_or, nn from paddle.fluid.layers.control_flow import cond, while_loop, less_than, increment +from paddle.fluid.dygraph.dygraph_to_static.return_transformer import RETURN_NO_VALUE_VAR_NAME def convert_while_loop(cond, body, loop_vars): @@ -204,10 +205,45 @@ def convert_ifelse(pred, true_fn, false_fn, true_args, false_args, return_vars): """ if isinstance(pred, Variable): - return _run_paddle_cond(pred, true_fn, false_fn, true_args, false_args, - return_vars) + out = _run_paddle_cond(pred, true_fn, false_fn, true_args, false_args, + return_vars) else: - return _run_py_ifelse(pred, true_fn, false_fn, true_args, false_args) + out = _run_py_ifelse(pred, true_fn, false_fn, true_args, false_args) + + return _remove_no_value_return_var(out) + + +def _remove_no_value_return_var(out): + if out and isinstance(out, tuple): + processed_out = out + align_ret = out[0] + if isinstance(align_ret, tuple): + for index, 
item in enumerate(align_ret): + if isinstance(item, Variable) and ( + RETURN_NO_VALUE_VAR_NAME in item.name): + # return None + if index == 0: + processed_out = (None, ) + out[1:] + elif index == 1: + processed_out = align_ret[:1] + out[1:] + else: + processed_out = (align_ret[:index], ) + out[1:] + break + + for index, item in enumerate(processed_out): + if isinstance(item, Variable) and ( + RETURN_NO_VALUE_VAR_NAME in item.name): + processed_out = processed_out[:index] + + if not processed_out: + return None + elif len(processed_out) == 1: + return processed_out[0] + else: + return processed_out + + else: + return out def _run_paddle_cond(pred, true_fn, false_fn, true_args, false_args, diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/variable_trans_func.py b/python/paddle/fluid/dygraph/dygraph_to_static/variable_trans_func.py index b118eeadf7e7e..2cd6c5e43f7e1 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/variable_trans_func.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/variable_trans_func.py @@ -93,14 +93,14 @@ def create_fill_constant_node(name, value): func_code = "{} = paddle.fluid.layers.fill_constant(shape=[1], ".format( name) if isinstance(value, bool): - func_code += "dtype='bool', value={})".format(value) + func_code += "dtype='bool', value={}, name='{}')".format(value, name) return gast.parse(func_code).body[0] if isinstance(value, float): - func_code += "dtype='float64', value={})".format(value) + func_code += "dtype='float64', value={}, name='{}')".format(value, name) return gast.parse(func_code).body[0] if isinstance(value, int): - func_code += "dtype='int64', value={})".format(value) + func_code += "dtype='int64', value={}, name='{}')".format(value, name) return gast.parse(func_code).body[0] diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_convert_operators.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_convert_operators.py index 54dcc152fd6b2..bb1942692fd9d 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_convert_operators.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_convert_operators.py @@ -261,5 +261,100 @@ def test_tensor_shape(self): self.assertTrue(np.array_equal(out.numpy(), x.numpy())) +class TestIfElseNoValue(unittest.TestCase): + def test_else_ret_none(self): + input_x = paddle.to_tensor([[1, 2, 3], [4, 5, 6]]) + + @paddle.jit.to_static + def with_common_value(x, use_cache=False): + if use_cache: + y = x + 1 + z = x + 2 + return y, z + else: + c = x + 1 + z = x - 1 + return None + + @paddle.jit.to_static + def without_common_value(x, use_cache=False): + if use_cache: + y = x + 1 + z = x + 2 + return y, z + else: + c = x + 1 + return None + + out = with_common_value(input_x, False) + self.assertIsNone(out) + out = without_common_value(input_x, False) + self.assertIsNone(out) + + def test_else_ret_c(self): + input_x = paddle.to_tensor([[1, 2, 3], [4, 5, 6]]) + + @paddle.jit.to_static + def with_common_value(x, use_cache=False): + if use_cache: + y = x + 1 + z = x + 2 + return y, z + else: + c = x + 1 + z = x - 1 + return c + + @paddle.jit.to_static + def without_common_value(x, use_cache=False): + if use_cache: + y = x + 1 + z = x + 2 + return y, z + else: + c = x + 1 + return c + + out = with_common_value(input_x, False) + self.assertListEqual(paddle.tolist(out), paddle.tolist(input_x + 1)) + out = without_common_value(input_x, False) + self.assertListEqual(paddle.tolist(out), paddle.tolist(input_x + 1)) + y, z = with_common_value(input_x, True) + 
self.assertListEqual(paddle.tolist(y), paddle.tolist(input_x + 1)) + self.assertListEqual(paddle.tolist(z), paddle.tolist(input_x + 2)) + + def test_else_ret_cz(self): + input_x = paddle.to_tensor([[1, 2, 3], [4, 5, 6]]) + + @paddle.jit.to_static + def with_common_value(x, use_cache=False): + if use_cache: + y = x + 1 + z = x + 2 + return y, z, 1 + else: + c = x + 1 + z = x - 1 + return c, z + + @paddle.jit.to_static + def without_common_value(x, use_cache=False): + if use_cache: + y = x + 1 + z = x + 2 + return y, z, 1 + else: + c = x + 1 + d = x - 1 + return c, d + + c, z = with_common_value(input_x, False) + self.assertListEqual(paddle.tolist(c), paddle.tolist(input_x + 1)) + self.assertListEqual(paddle.tolist(z), paddle.tolist(input_x - 1)) + c, d = without_common_value(input_x, False) + self.assertListEqual(paddle.tolist(c), paddle.tolist(input_x + 1)) + self.assertListEqual(paddle.tolist(d), paddle.tolist(input_x - 1)) + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_program_translator.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_program_translator.py index 9e12b6fa20850..6fef356326b81 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_program_translator.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_program_translator.py @@ -64,7 +64,7 @@ def get_source_code(func): class StaticCode1(): def dyfunc_with_if_else(x_v, label=None): __return_value_init_0 = paddle.fluid.layers.fill_constant( - shape=[1], dtype='float64', value=0.0) + shape=[1], dtype='float64', value=0.0, name='__return_value_init_0') __return_value_0 = __return_value_init_0 def true_fn_0(x_v): @@ -116,7 +116,7 @@ class StaticCode2(): # TODO: Transform return statement def dyfunc_with_if_else(x_v, label=None): __return_value_init_1 = paddle.fluid.layers.fill_constant( - shape=[1], dtype='float64', value=0.0) + shape=[1], dtype='float64', value=0.0, name='__return_value_init_1') __return_value_1 = __return_value_init_1 def true_fn_3(x_v): diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_variable_trans_func.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_variable_trans_func.py index 3431c6aac4cbe..8500f46d974d8 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_variable_trans_func.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_variable_trans_func.py @@ -50,16 +50,22 @@ def test_feed_mismatch_shape(self): class TestVariableTransFunc(unittest.TestCase): def test_create_fill_constant_node(self): node = create_fill_constant_node("a", 1.0) - source = "a = paddle.fluid.layers.fill_constant(shape=[1], dtype='float64', value=1.0)" - self.assertEqual(ast_to_source_code(node).strip(), source) + source = "a = paddle.fluid.layers.fill_constant(shape=[1], dtype='float64', value=1.0, name='a')" + self.assertEqual( + ast_to_source_code(node).replace('\n', '').replace(' ', ''), + source.replace(' ', '')) node = create_fill_constant_node("b", True) - source = "b = paddle.fluid.layers.fill_constant(shape=[1], dtype='bool', value=True)" - self.assertEqual(ast_to_source_code(node).strip(), source) + source = "b = paddle.fluid.layers.fill_constant(shape=[1], dtype='bool', value=True, name='b')" + self.assertEqual( + ast_to_source_code(node).replace('\n', '').replace(' ', ''), + source.replace(' ', '')) node = create_fill_constant_node("c", 4293) - source = "c = paddle.fluid.layers.fill_constant(shape=[1], dtype='int64', value=4293)" - 
self.assertEqual(ast_to_source_code(node).strip(), source) + source = "c = paddle.fluid.layers.fill_constant(shape=[1], dtype='int64', value=4293, name='c')" + self.assertEqual( + ast_to_source_code(node).replace('\n', '').replace(' ', ''), + source.replace(' ', '')) self.assertIsNone(create_fill_constant_node("e", None)) self.assertIsNone(create_fill_constant_node("e", [])) From 127488ba91fb5a9ead32cce93a23ec3750fcc90e Mon Sep 17 00:00:00 2001 From: Aurelius84 Date: Wed, 20 Oct 2021 10:19:24 +0800 Subject: [PATCH 036/116] Add kQueueSync.synchronize_run_ logic (#36546) --- .../fluid/framework/new_executor/interpretercore.cc | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/paddle/fluid/framework/new_executor/interpretercore.cc b/paddle/fluid/framework/new_executor/interpretercore.cc index 083d989cb5267..f6157367cd4e2 100644 --- a/paddle/fluid/framework/new_executor/interpretercore.cc +++ b/paddle/fluid/framework/new_executor/interpretercore.cc @@ -410,13 +410,14 @@ void InterpreterCore::RunNextInstruction(const Instruction& instr) { [&, next_id] { RunInstructionAsync(next_id); }); } } - - for (size_t i = 0; i < next_instr.direct_run_.size(); ++i) { - auto next_id = next_instr.direct_run_[i]; + auto direct_run_ops = interpretercore::merge_vector( + next_instr.synchronize_run_, next_instr.direct_run_); + size_t first_op = 0; + for (auto next_id : direct_run_ops) { if (IsReady(next_id)) { // only keep one op running in current thread - if (i == 0) { - RunInstructionAsync(next_id); + if (first_op == 0) { + first_op = next_id; continue; } // move rest ops into other threads @@ -425,6 +426,7 @@ void InterpreterCore::RunNextInstruction(const Instruction& instr) { [&, next_id] { RunInstructionAsync(next_id); }); } } + if (first_op != 0) RunInstructionAsync(first_op); } } From 797bd40d093189ce3c9f24fcd0f59bbe2878b2ca Mon Sep 17 00:00:00 2001 From: JZ-LIANG <38102074+JZ-LIANG@users.noreply.github.com> Date: Wed, 20 Oct 2021 10:23:35 +0800 Subject: [PATCH 037/116] [Auto Parallel] Generalization for Partition and Completion (#35735) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * default dist op * add dist_attr for dist op * add unitest * update inputname * update function name * add unitest * update CMakeLists.txt for CI * fix dis_matmul * fix compile error * update matmul to matmul_v2 * unify api * unify api * todo * update distop forward func * update distop forward func * auto parallel backward * update dist op * autoparallel backward * add backward for embedding * temp1 * temp2 * temp3 * temp4 * backward done1 * backward done2 * backward done3 * dist embedding remove mp mode * dist matmul remove mp mode * update dist embedding 怎 * dist op init1 * dist op init 2 * update unitest * context remove parallel mode * partitioner remove parallel mode * update unitest * a more general method to support varying mesh in pipeline parallel * support varying mesh in pipeline parallel * embedding support varying mesh in pipeline parallel * matmul support varying mesh in pipeline parallel * default dist op support varying mesh in pipeline parallel * dist attribute for startup program * default dist op support varying mesh in pipeline parallel 2 * partitoner support varying mesh in pipeline parallel * revise logic for auto compeletion * revise framework.py * revise reshard unitest * revise unitest for parallelize * chmod * fixed bug for dist embedding name mapping Co-authored-by: zhaoyingli --- .../distributed/auto_parallel/completion.py | 
269 +++--- .../distributed/auto_parallel/context.py | 125 ++- .../auto_parallel/operators/__init__.py | 1 + .../auto_parallel/operators/common.py | 6 +- .../auto_parallel/operators/dist_default.py | 247 +++++ .../auto_parallel/operators/dist_embedding.py | 331 ++++--- .../auto_parallel/operators/dist_matmul.py | 911 +++++++++++------- .../auto_parallel/operators/dist_reshape.py | 288 +++--- .../auto_parallel/operators/dist_softmax.py | 6 + .../auto_parallel/operators/dist_transpose.py | 6 + .../distributed/auto_parallel/parallelizer.py | 4 +- .../distributed/auto_parallel/partitioner.py | 414 ++++---- .../paddle/distributed/auto_parallel/utils.py | 45 +- python/paddle/fluid/backward.py | 13 +- .../fluid/tests/unittests/CMakeLists.txt | 3 + .../unittests/auto_parallel_parallelizer.py | 140 +++ .../test_auto_parallel_parallelizer.py | 126 +-- .../test_auto_parallel_partitioner.py | 100 +- .../test_auto_parallel_partitioner_gpt.py | 30 +- .../unittests/test_auto_parallel_reshard.py | 7 +- .../test_auto_parallel_reshard_dpmppp.py | 2 - .../test_auto_parallel_reshard_mppp.py | 2 - 22 files changed, 1896 insertions(+), 1180 deletions(-) create mode 100755 python/paddle/distributed/auto_parallel/operators/dist_default.py mode change 100644 => 100755 python/paddle/distributed/auto_parallel/operators/dist_embedding.py create mode 100755 python/paddle/fluid/tests/unittests/auto_parallel_parallelizer.py diff --git a/python/paddle/distributed/auto_parallel/completion.py b/python/paddle/distributed/auto_parallel/completion.py index 3fdbad6950db5..855eb656bd90e 100644 --- a/python/paddle/distributed/auto_parallel/completion.py +++ b/python/paddle/distributed/auto_parallel/completion.py @@ -24,6 +24,7 @@ from .context import get_default_distributed_context from .operators import find_best_compatible_distributed_operator_impl from .attribute import OperatorDistributedAttribute, TensorDistributedAttribute +from paddle.distributed.fleet.meta_optimizers.common import OpRole ELEMENTWISE_LIKE_OP_LIST = ["elementwise_add", "gelu", "dropout", "cast"] @@ -600,7 +601,7 @@ def sort_key_fun(node): return program -def complete_backward_annotation(auto_parallel_main_prog, dist_context): +def complete_backward_annotation(auto_parallel_main_prog, dist_context=None): """Complete the annotation of vars and ops in the backward phase for parallel program.""" def _is_grad_var_name(name): @@ -608,24 +609,44 @@ def _is_grad_var_name(name): return True return False - grad_start_idx = None + def _get_forward_varname_from_grad_varname(grad_var_name): + assert _is_grad_var_name( + grad_var_name), "[{}] is not a grad varnme.".format(grad_var_name) + return grad_var_name[:grad_var_name.find("@GRAD")] + + def _get_op_by_id(ops, id): + for op in ops: + if op.desc.id() == id: + return op + return None + + if dist_context is None: + dist_context = get_default_distributed_context() + + grad_start_idx = -1 for idx, op in enumerate(auto_parallel_main_prog.global_block().ops): - for var_name in op.output_arg_names: - # TODO: use _is_loss_op to judge - if "@GRAD" in var_name and op.type == "fill_constant": - grad_start_idx = idx - break - assert grad_start_idx is not None, "No backward procedure found in this program." + if int(op.attr('op_role')) == int( + int(core.op_proto_and_checker_maker.OpRole.Backward) | int( + core.op_proto_and_checker_maker.OpRole.Loss)): + assert op.type == "fill_constant" + grad_start_idx = idx + break + + assert grad_start_idx >= 0, "No backward procedure found in this program." 
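For context on the op_role check above: in Paddle's op-role bitmask, Backward and Loss are distinct bits (0x0001 and 0x0100 in current releases), so the only op carrying Backward | Loss is the fill_constant that seeds the loss gradient, which is what the assert relies on. A quick way to confirm the mask, assuming a standard Paddle build:

    from paddle.fluid import core

    OpRole = core.op_proto_and_checker_maker.OpRole
    mask = int(OpRole.Backward) | int(OpRole.Loss)
    print(hex(mask))  # expected 0x101, the role attached to the loss@GRAD fill_constant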
ops = list(auto_parallel_main_prog.global_block().ops) vars = auto_parallel_main_prog.global_block().vars + for idx in range(grad_start_idx, len(ops)): - # complete the loss op + + # complete the initial grad loss op if idx == grad_start_idx: grad_var = vars[ops[idx].output_arg_names[0]] - grad_var_name = grad_var.name - forward_var_name = grad_var_name[:grad_var_name.find("@GRAD")] + forward_var_name = _get_forward_varname_from_grad_varname( + grad_var.name) forward_var = vars[forward_var_name] + + # TODO complete other attribte for grad var tensor_attr = TensorDistributedAttribute(grad_var, dist_context) process_mesh = dist_context.get_tensor_distributed_attr_for_program( forward_var).get_process_mesh() @@ -635,39 +656,31 @@ def _is_grad_var_name(name): tensor_attr.set_process_mesh(process_mesh) dist_context.set_tensor_distributed_attr_for_program(grad_var, tensor_attr) + op_attr = OperatorDistributedAttribute(ops[idx], dist_context) op_attr.set_process_mesh(process_mesh) dist_context.set_op_distributed_attr_for_program(ops[idx], op_attr) - - # in the data parallel mode, the loss op followed by scale op. - if ops[idx + 1].type == "scale" and grad_var_name in ops[idx + 1].input_arg_names \ - and grad_var_name in ops[idx + 1].output_arg_names: - op_attr = OperatorDistributedAttribute(ops[idx + 1], - dist_context) - op_attr.set_process_mesh(process_mesh) - dist_context.set_op_distributed_attr_for_program(ops[idx + 1], - op_attr) continue - # complete the annotation of the optimizer op. - # TODO: use _is_optimizer_op to judge - if "Grad" in ops[idx].input_names and "Param" in ops[idx].input_names: - assert len(ops[idx].input( - "Param")) == 1, "Only support one-to-one now." - assert len(ops[idx].input( - "Grad")) == 1, "Only support one-to-one now." - var = vars[ops[idx].input("Param")[0]] - grad_var = vars[ops[idx].input("Grad")[0]] + # TODO remove this when dist op handle its own grad scale + # in the data parallel mode, the loss op followed by scale op. + if ops[idx].type == "scale" and idx == grad_start_idx + 1: + assert grad_var.name in ops[ + idx].input_arg_names and grad_var.name in ops[ + idx].output_arg_names + grad_var = vars[ops[idx].output_arg_names[0]] + forward_var_name = _get_forward_varname_from_grad_varname( + grad_var.name) + forward_var = vars[forward_var_name] process_mesh = dist_context.get_tensor_distributed_attr_for_program( - var).get_process_mesh() - dims_mapping = dist_context.get_tensor_distributed_attr_for_program( - var).get_dims_mapping() + forward_var).get_process_mesh() op_attr = OperatorDistributedAttribute(ops[idx], dist_context) op_attr.set_process_mesh(process_mesh) - op_attr.set_input_dims_mapping(grad_var.name, dims_mapping) dist_context.set_op_distributed_attr_for_program(ops[idx], op_attr) continue + # TODO remove this when dist op handle its own communication + # TODO should distinguish the dp allreduce and mp allreduce # complete the c_allreduce_sum op for gradient in the data parallel mode. 
if ops[idx].type == "c_allreduce_sum" and ops[ idx].input_arg_names == ops[idx].output_arg_names: @@ -679,91 +692,123 @@ def _is_grad_var_name(name): dist_context.set_op_distributed_attr_for_program(ops[idx], op_attr) continue - # complete the annotation of grad op + # complete the annotation of grad op (xxx_grad op or sum op) grad_op = ops[idx] - for i, op in enumerate(ops[:grad_start_idx]): - match_op = None - grad_op_desc_list, op_grad_to_var = core.get_grad_op_desc(op.desc, - set(), - []) - grad_op_input = [] - for input_arg_name in grad_op.desc.input_arg_names(): - if "@GRAD" in input_arg_name: - name = input_arg_name[:input_arg_name.find("@GRAD") + 5] - grad_op_input.append(name) - else: - grad_op_input.append(input_arg_name) - - # like sum op: the count of grad op will larger than 1 - if len(grad_op_desc_list) > 1: - for grad_op_desc in grad_op_desc_list: - if grad_op_input == grad_op_desc.input_arg_names() \ - and grad_op.desc.type() == grad_op_desc.type(): - match_op = op - break - elif len(grad_op_desc_list) == 1: - if grad_op_input == grad_op_desc_list[0].input_arg_names() \ - and grad_op.desc.type() == grad_op_desc_list[0].type(): - match_op = op - - if match_op is not None: - op_attr = dist_context.get_op_distributed_attr_for_program(op) - grad_op_attr = OperatorDistributedAttribute(grad_op, - dist_context) - grad_op_attr.set_process_mesh(op_attr.get_process_mesh()) - for var_name in grad_op.input_arg_names: - if "@GRAD" in var_name: - dims_mapping = dist_context.get_tensor_distributed_attr_for_program( - vars[var_name]).get_dims_mapping() - grad_op_attr.set_input_dims_mapping(var_name, - dims_mapping) - else: - dims_mapping = op_attr.get_input_dims_mapping(var_name) - grad_op_attr.set_input_dims_mapping(var_name, - dims_mapping) - dist_context.set_op_distributed_attr_for_program(grad_op, - grad_op_attr) - - for var_name in grad_op.output_arg_names: - if "@GRAD" in var_name: - forward_var = vars[var_name[:var_name.find("@GRAD")]] - tensor_attr = TensorDistributedAttribute(vars[var_name], - dist_context) - process_mesh = grad_op_attr.get_process_mesh() - dims_mapping = grad_op_attr.get_input_dims_mapping( - forward_var.name) - tensor_attr.set_process_mesh(process_mesh) - tensor_attr.set_dims_mapping(dims_mapping) - dist_context.set_tensor_distributed_attr_for_program( - vars[var_name], tensor_attr) - break - - # complete the annotation of sum op for multiple renamed grad var - if grad_op.type == "sum" and all( - map(_is_grad_var_name, grad_op.input_arg_names)): - assert len(grad_op.output_arg_names - ) == 1, "The output count of sum op should be one." 
+ + # xxx_grad op will have a corresponding forward op in gradopidx2opidx + dist_op_helper = dist_context.get_dist_op_helper() + if grad_op.desc.id() in dist_op_helper.gradopidx2opidx: + # TODO support the case where one forward op corresponding to multiple xxx_grad op + forward_op = _get_op_by_id( + ops[:grad_start_idx], + dist_op_helper.gradopidx2opidx[grad_op.desc.id()]) + assert forward_op is not None + + # op dist attr + forward_op_attr = dist_context.get_op_distributed_attr_for_program( + forward_op) grad_op_attr = OperatorDistributedAttribute(grad_op, dist_context) + grad_op_attr.set_process_mesh(forward_op_attr.get_process_mesh()) + for var_name in grad_op.input_arg_names: if "@GRAD" in var_name: - forward_var = vars[var_name[:var_name.find("@GRAD")]] dims_mapping = dist_context.get_tensor_distributed_attr_for_program( - forward_var).get_dims_mapping() + vars[var_name]).get_dims_mapping() + grad_op_attr.set_input_dims_mapping(var_name, dims_mapping) + else: + dims_mapping = forward_op_attr.get_input_dims_mapping( + var_name) + # TODO fixed here + if dims_mapping == None: + dims_mapping = forward_op_attr.get_output_dims_mapping( + var_name) + assert dims_mapping is not None, "[{}]'s dims_mapping is None".format( + var_name) grad_op_attr.set_input_dims_mapping(var_name, dims_mapping) + dist_context.set_op_distributed_attr_for_program(grad_op, + grad_op_attr) + # var dist attr for var_name in grad_op.output_arg_names: - forward_var = vars[var_name[:var_name.find("@GRAD")]] - tensor_attr = TensorDistributedAttribute(vars[var_name], - dist_context) - process_mesh = dist_context.get_tensor_distributed_attr_for_program( - forward_var).get_process_mesh() - dims_mapping = dist_context.get_tensor_distributed_attr_for_program( - forward_var).get_dims_mapping() - tensor_attr.set_dims_mapping(dims_mapping) - tensor_attr.set_process_mesh(process_mesh) - dist_context.set_tensor_distributed_attr_for_program( - vars[var_name], tensor_attr) - grad_op_attr.set_process_mesh( - dist_context.get_tensor_distributed_attr_for_program( - forward_var).get_process_mesh()) + if _is_grad_var_name(var_name): + + forward_var_name = _get_forward_varname_from_grad_varname( + var_name) + forward_var = vars[forward_var_name] + tensor_attr = TensorDistributedAttribute(vars[var_name], + dist_context) + process_mesh = grad_op_attr.get_process_mesh() + dims_mapping = grad_op_attr.get_input_dims_mapping( + forward_var_name) + tensor_attr.set_process_mesh(process_mesh) + tensor_attr.set_dims_mapping(dims_mapping) + dist_context.set_tensor_distributed_attr_for_program( + vars[var_name], tensor_attr) + + # only sum op for merge mutiple version grad has no a corresponding mapping in gradopidx2opidx + else: + assert grad_op.type == "sum", "got unexpect op [{}]".format( + str(grad_op.type)) + assert all(map(_is_grad_var_name, grad_op.input_arg_names)) + assert len(grad_op.output_arg_names) == 1 + + ref_forward_var_name = _get_forward_varname_from_grad_varname( + grad_op.output_arg_names[0]) + forward_var = vars[ref_forward_var_name] + ref_forward_var_dims_mapping = dist_context.get_tensor_distributed_attr_for_program( + forward_var).get_dims_mapping() + ref_forward_var_process_mesh = dist_context.get_tensor_distributed_attr_for_program( + forward_var).get_process_mesh() + + # output + tensor_attr = TensorDistributedAttribute( + vars[grad_op.output_arg_names[0]], dist_context) + tensor_attr.set_dims_mapping(ref_forward_var_dims_mapping) + tensor_attr.set_process_mesh(ref_forward_var_process_mesh) + 
dist_context.set_tensor_distributed_attr_for_program( + vars[grad_op.output_arg_names[0]], tensor_attr) + + # op + grad_op_attr = OperatorDistributedAttribute(grad_op, dist_context) + grad_op_attr.set_process_mesh(ref_forward_var_process_mesh) + for var_name in grad_op.input_arg_names: + assert _get_forward_varname_from_grad_varname( + var_name) == ref_forward_var_name + grad_op_attr.set_input_dims_mapping( + var_name, ref_forward_var_dims_mapping) dist_context.set_op_distributed_attr_for_program(grad_op, grad_op_attr) + + +def complete_update_annotation(auto_parallel_main_prog, dist_context): + """Complete the annotation of vars and ops in the update phase for parallel program.""" + + if dist_context is None: + dist_context = get_default_distributed_context() + + ops = list(auto_parallel_main_prog.global_block().ops) + vars = auto_parallel_main_prog.global_block().vars + + for idx in range(len(ops)): + + # complete the annotation of the optimizer op. + # TODO to add attribute for moment var + if int(ops[idx].attr('op_role')) == int(OpRole.Optimize): + if "Grad" in ops[idx].input_names and "Param" in ops[ + idx].input_names: + assert len(ops[idx].input( + "Param")) == 1, "Only support one-to-one now." + assert len(ops[idx].input( + "Grad")) == 1, "Only support one-to-one now." + param = vars[ops[idx].input("Param")[0]] + grad_var = vars[ops[idx].input("Grad")[0]] + process_mesh = dist_context.get_tensor_distributed_attr_for_program( + param).get_process_mesh() + dims_mapping = dist_context.get_tensor_distributed_attr_for_program( + param).get_dims_mapping() + op_attr = OperatorDistributedAttribute(ops[idx], dist_context) + op_attr.set_process_mesh(process_mesh) + op_attr.set_input_dims_mapping(grad_var.name, dims_mapping) + op_attr.set_input_dims_mapping(param.name, dims_mapping) + dist_context.set_op_distributed_attr_for_program(ops[idx], + op_attr) + continue diff --git a/python/paddle/distributed/auto_parallel/context.py b/python/paddle/distributed/auto_parallel/context.py index 5e6565aa3d84c..6785f21351aa4 100644 --- a/python/paddle/distributed/auto_parallel/context.py +++ b/python/paddle/distributed/auto_parallel/context.py @@ -51,23 +51,8 @@ def __init__(self): self._op_distributed_attr_map_for_program = {} self._tensor_distributed_attr_map_for_graph = {} self._op_distributed_attr_map_for_graph = {} - # The following is a hard code and will be removed in the future - self._data_parallel_axis = None - self._model_parallel_axis = None + self._get_dist_op_helper = DistOpHelper() self._process_mesh = _g_process_mesh_map.get(0, None) - if self._process_mesh is not None: - if self._process_mesh.ndim == 1: - self._data_parallel_axis = 0 - self._model_parallel_axis = 0 - elif self._process_mesh.ndim == 3: - self._data_parallel_axis = 1 - self._model_parallel_axis = 2 - else: - self._data_parallel_axis = 0 - self._model_parallel_axis = 1 - else: - self._data_parallel_axis = -1 - self._model_parallel_axis = -1 def is_initialized_for_program(self): return self._is_initialized_for_program @@ -120,16 +105,9 @@ def set_op_distributed_attr_for_graph(self, op_node, op_dist_attr): def set_process_mesh(self, process_mesh): self._process_mesh = process_mesh - if self._process_mesh is not None: - if self._process_mesh.ndim == 1: - self._data_parallel_axis = 0 - self._model_parallel_axis = 0 - else: - self._data_parallel_axis = 0 - self._model_parallel_axis = 1 - else: - self._data_parallel_axis = -1 - self._model_parallel_axis = -1 + + def get_dist_op_helper(self): + return self._get_dist_op_helper def 
initialize_distributed_attr_for_program(self, program): if self._is_initialized_for_program: @@ -425,10 +403,93 @@ def amend_distributed_attr_for_program(self): and process_mesh_shape[dims_mapping[i]] > tensor_shape[i]: dims_mapping[i] = -1 - def _get_data_parallel_info(self): - # This function is a hard code, and will be obsoleted in the future - return self._data_parallel_axis, self._process_mesh - def _get_model_parallel_info(self): - # This function is a hard code, and will be obsoleted in the future - return self._model_parallel_axis, self._process_mesh +class DistOpHelper: + """ + DistOpHelper is used to create a dist op desc in Program. + Every time to create a new dist op, the context should be updated for it accordingly. + """ + + def __init__(self): + self._dst_main_program = None + self._dst_startup_program = None + self._varname_mapping = None + self._rank_id = None + self._cur_src_op = None + self._cur_dist_attr = None + self.gradopidx2opidx = {} + self.already_init_sync_vars = set() + + def set_dst_main_program(self, prog): + self._dst_main_program = prog + + def get_dst_main_program(self): + return self._dst_main_program + + def set_dst_startup_program(self, prog): + self._dst_startup_program = prog + + def get_dst_startup_program(self): + return self._dst_startup_program + + def set_varname_mapping(self, mapping): + self._varname_mapping = mapping + + def get_varname_mapping(self): + return self._varname_mapping + + def set_rank_id(self, rank_id): + self._rank_id = rank_id + + def get_rank_id(self): + return self._rank_id + + def set_cur_src_op(self, cur_src_op): + self._cur_src_op = cur_src_op + + def get_cur_src_op(self): + return self._cur_src_op + + def prepare_forward_context(self, src_op): + + self.set_cur_src_op(src_op) + + # build input varname mapping + kinputs = {} + for input_name in src_op.desc.input_names(): + varnames = [] + for varname in src_op.desc.input(input_name): + varnames.append(self._varname_mapping[varname]) + kinputs[input_name] = varnames + + # build output varname mapping + koutputs = {} + for output_name in src_op.desc.output_names(): + varnames = [] + for varname in src_op.desc.output(output_name): + varnames.append(self._varname_mapping[varname]) + koutputs[output_name] = varnames + + return kinputs, koutputs + + def prepare_backward_context(self, backward_op): + + self.set_cur_src_op(backward_op) + + # build input varname mapping + kinputs = {} + for input_name in backward_op.desc.input_names(): + varnames = [] + for varname in backward_op.desc.input(input_name): + varnames.append(varname) + kinputs[input_name] = varnames + + # build output varname mapping + koutputs = {} + for output_name in backward_op.desc.output_names(): + varnames = [] + for varname in backward_op.desc.output(output_name): + varnames.append(varname) + koutputs[output_name] = varnames + + return kinputs, koutputs diff --git a/python/paddle/distributed/auto_parallel/operators/__init__.py b/python/paddle/distributed/auto_parallel/operators/__init__.py index 14ded477cb709..3b3359b4ebf1c 100644 --- a/python/paddle/distributed/auto_parallel/operators/__init__.py +++ b/python/paddle/distributed/auto_parallel/operators/__init__.py @@ -22,3 +22,4 @@ from . import dist_reshape from . import dist_softmax from . import dist_transpose +from . 
import dist_default diff --git a/python/paddle/distributed/auto_parallel/operators/common.py b/python/paddle/distributed/auto_parallel/operators/common.py index 1b0b05d39547a..5685c40a3227b 100644 --- a/python/paddle/distributed/auto_parallel/operators/common.py +++ b/python/paddle/distributed/auto_parallel/operators/common.py @@ -36,10 +36,12 @@ def __init__(self): self._forward_implemented = False self._backward_implemented = False - def forward(self, dist_ctx, *args, **kwargs): + @staticmethod + def forward(dist_ctx, *args, **kwargs): raise NotImplementedError("Please Implement this method in Subclass.") - def backward(self, dist_ctx, *grad_outputs): + @staticmethod + def backward(dist_ctx, *grad_outputs, **kwargs): raise NotImplementedError("Please Implement this method in Subclass.") def get_name(self): diff --git a/python/paddle/distributed/auto_parallel/operators/dist_default.py b/python/paddle/distributed/auto_parallel/operators/dist_default.py new file mode 100755 index 0000000000000..cf17b7afb0f39 --- /dev/null +++ b/python/paddle/distributed/auto_parallel/operators/dist_default.py @@ -0,0 +1,247 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License + +from .common import DistributedOperator +from .common import DistributedOperatorImpl +from .common import register_distributed_operator +from .common import register_distributed_operator_impl +from ..utils import is_dim_shard +from ..utils import is_dim_replicate +from ..utils import is_valid_list_index +from ..utils import compute_compatible_dim_mapping +from ..utils import compute_compatible_dims_mapping +from ..utils import compute_compatible_and_update_dim_mapping +from ..attribute import OperatorDistributedAttribute +from paddle.fluid import core, unique_name +from paddle.fluid.framework import in_dygraph_mode +from paddle.fluid.framework import Program, Parameter, Variable, program_guard +from paddle.fluid.data_feeder import check_variable_and_dtype, check_dtype +from paddle.distributed.fleet.meta_optimizers.common import OpRole, OP_ROLE_KEY, OP_ROLE_VAR_KEY +from ..process import new_process_group +from ..utils import _get_comm_group, _get_corresponding_rank + + +class DistributedDefault(DistributedOperator): + def __init__(self, name): + super(DistributedDefault, self).__init__() + self._name = name + + +register_distributed_operator("default", DistributedDefault("default")) + + +# Replicated Default +class DistributedDefaultImpl0(DistributedOperatorImpl): + def __init__(self, name): + super(DistributedDefaultImpl0, self).__init__() + self._name = name + self._forward_implemented = True + self._backward_implemented = True + + def is_process_mesh_compatible(self, op_dist_attr): + raise NotImplementedError("Please Implement this method.") + + def is_input_compatible(self, op_dist_attr): + raise NotImplementedError("Please Implement this method.") + + def is_output_compatible(self, op_dist_attr): + raise NotImplementedError("Please Implement this method.") + + def 
update_dims_mapping(self, op_dist_attr): + raise NotImplementedError("Please Implement this method.") + + @staticmethod + def forward(ctx, *args, **kwargs): + + dist_op_helper = ctx.get_dist_op_helper() + main_block = dist_op_helper.get_dst_main_program().global_block() + startup_block = dist_op_helper.get_dst_startup_program().global_block() + src_op = dist_op_helper.get_cur_src_op() + varname_mapping = dist_op_helper.get_varname_mapping() + rank_id = dist_op_helper.get_rank_id() + + # check validation of inputs / outputs + for input_name in src_op.desc.input_names(): + assert input_name in kwargs, "input [{}] is not given".format( + input_name) + assert len(kwargs[input_name]) == len( + src_op.desc.input(input_name) + ), "number of tensor for input [{}] is not match".format(input_name) + for output_name in src_op.desc.output_names(): + assert output_name in kwargs, "input [{}] is not given".format( + output_name) + assert len(kwargs[output_name]) == len( + src_op.desc.output(output_name) + ), "number of tensor for input [{}] is not match".format( + output_name) + + # replicate op in dist program + dist_op_desc = main_block.desc.append_op() + dist_op_desc.copy_from(src_op.desc) + for input_name in src_op.desc.input_names(): + dist_op_desc.set_input(input_name, kwargs[input_name]) + for output_name in src_op.desc.output_names(): + dist_op_desc.set_output(output_name, kwargs[output_name]) + + main_block._sync_with_cpp() + + # param initialization sync + for varname in dist_op_desc.input_arg_names(): + if startup_block.has_var(varname) and startup_block.var( + varname + ).is_parameter and varname not in dist_op_helper.already_init_sync_vars: + dist_op_helper.already_init_sync_vars.add(varname) + param = startup_block.var(varname) + param_dist_attr = ctx.get_tensor_distributed_attr_for_program( + param) + process_mesh = param_dist_attr.get_process_mesh() + dims_mapping = param_dist_attr.get_dims_mapping() + + # FIXME (JZ-LIANG) Remove this hack to support any op mesh group for Pipeline Parallelism + if rank_id not in process_mesh.process_group: + rank_id = _get_corresponding_rank(process_mesh, rank_id) + + # NOTE all not splited axis should be presented in mesh + for axis, size in enumerate(process_mesh.topology): + if size <= 1 or axis in dims_mapping: + pass + else: + group_ranks = _get_comm_group( + process_mesh.process_group, process_mesh.topology, + axis, rank_id) + sync_group = new_process_group(group_ranks) + + new_op = startup_block.append_op( + type='c_broadcast', + inputs={'X': param}, + outputs={'Out': param}, + attrs={ + 'ring_id': sync_group.id, + 'root': 0, + 'use_calc_stream': True, + OP_ROLE_KEY: OpRole.Forward + }) + + # set distributed attribute + op_attr = OperatorDistributedAttribute(new_op, ctx) + op_attr.set_process_mesh(process_mesh) + op_attr.set_output_dims_mapping(param.name, + dims_mapping) + op_attr.set_input_dims_mapping(param.name, dims_mapping) + ctx.set_op_distributed_attr_for_program(new_op, op_attr) + + startup_block._sync_with_cpp() + + @staticmethod + def backward(ctx, *args, **kwargs): + + # by now the backward function only insert the gradient allreduce for dist op itself + dist_op_helper = ctx.get_dist_op_helper() + main_block = dist_op_helper.get_dst_main_program().global_block() + backward_op = dist_op_helper.get_cur_src_op() + dist_attr = ctx.get_op_distributed_attr_for_program(backward_op) + assert dist_attr is not None, "backward op [{}] don't have dist attribute !".format( + str(backward_op)) + rank_id = dist_op_helper.get_rank_id() + + # check if 
need gradient allreduce + # if there is a non-gradient & non-parameter input and its batch dimension is splited, + # we need insert gradient allreduce for the gradient of parameter in its output + need_gradient_allreduce = False + for input_name in backward_op.desc.input_names(): + for varname in backward_op.desc.input(input_name): + if "@GRAD" not in varname and not main_block.var( + varname).is_parameter: + + # NOTE input var's dim_mapping of backward op should be the same with input var instead of corresponding varname of forward op + process_mesh = dist_attr.get_process_mesh() + var_dim_mapping = dist_attr.get_input_dims_mapping(varname) + + # FIXME (JZ-LIANG) Remove this hack to support any op mesh group for Pipeline Parallelism + if rank_id not in process_mesh.process_group: + rank_id = _get_corresponding_rank(process_mesh, rank_id) + + mesh_shape = process_mesh.topology + batch_size_axis = var_dim_mapping[0] + if batch_size_axis > -1 and mesh_shape[batch_size_axis] > 1: + need_gradient_allreduce = True + group_ranks = _get_comm_group( + process_mesh.process_group, process_mesh.topology, + batch_size_axis, rank_id) + dp_degree = len(group_ranks) + dp_group = new_process_group(group_ranks) + break + + if need_gradient_allreduce: + allreduce_vars = [] + for input_name in backward_op.desc.input_names(): + for varname in backward_op.desc.input(input_name): + if "@GRAD" not in varname and main_block.var( + varname).is_parameter: + assert len( + backward_op.desc.input(input_name) + ) == 1, "parameter input to grad op should be length 1, but got [{}]".format( + backward_op.desc.input(input_name)) + + assert varname + "@GRAD" in backward_op.desc.output_arg_names( + ), "parameter's grad [{}] not found in the grad op's output".format( + varname + "@GRAD") + assert len( + backward_op.desc.output(input_name + "@GRAD") + ) == 1, "parameter grad of grad op should be length 1, but got [{}]".format( + backward_op.desc.output(input_name + "@GRAD")) + allreduce_vars.append( + backward_op.desc.output(input_name + "@GRAD")[0]) + + if len(allreduce_vars) > 0: + + for varname in allreduce_vars: + + grad_var = main_block.var(varname) + allreduce_op = main_block.append_op( + type='c_allreduce_sum', + inputs={'X': [grad_var]}, + outputs={'Out': [grad_var]}, + attrs={ + 'ring_id': dp_group.id, + 'use_calc_stream': True, + OP_ROLE_KEY: OpRole.Backward + }) + + scale_op = main_block.append_op( + type='scale', + inputs={'X': grad_var}, + outputs={'Out': grad_var}, + attrs={ + 'scale': 1.0 / dp_degree, + OP_ROLE_KEY: OpRole.Backward + }) + + dims_mapping = ctx.get_tensor_distributed_attr_for_program( + grad_var).get_dims_mapping() + process_mesh = dist_attr.get_process_mesh() + for op in [allreduce_op, scale_op]: + op_attr = OperatorDistributedAttribute(op, ctx) + op_attr.set_process_mesh(process_mesh) + op_attr.set_output_dims_mapping(grad_var.name, + dims_mapping) + op_attr.set_input_dims_mapping(grad_var.name, + dims_mapping) + ctx.set_op_distributed_attr_for_program(op, op_attr) + + main_block._sync_with_cpp() + + +register_distributed_operator_impl( + "default", DistributedDefaultImpl0("replicate_parallel")) diff --git a/python/paddle/distributed/auto_parallel/operators/dist_embedding.py b/python/paddle/distributed/auto_parallel/operators/dist_embedding.py old mode 100644 new mode 100755 index 3f8fbf9cc3a7a..cd6d2255c81f1 --- a/python/paddle/distributed/auto_parallel/operators/dist_embedding.py +++ b/python/paddle/distributed/auto_parallel/operators/dist_embedding.py @@ -24,12 +24,14 @@ from ..utils import 
compute_compatible_dim_mapping from ..utils import compute_compatible_dims_mapping from ..utils import compute_compatible_and_update_dim_mapping +from ..attribute import OperatorDistributedAttribute from paddle.fluid import core, unique_name from paddle.fluid.framework import in_dygraph_mode from paddle.fluid.framework import Program, Parameter, Variable, program_guard from paddle.fluid.data_feeder import check_variable_and_dtype, check_dtype +from paddle.distributed.fleet.meta_optimizers.common import OpRole, OP_ROLE_KEY, OP_ROLE_VAR_KEY from ..process import new_process_group -from ..utils import _get_comm_group +from ..utils import _get_comm_group, _get_idx_in_axis, _get_corresponding_rank class DistributedEmbedding(DistributedOperator): @@ -40,6 +42,7 @@ def __init__(self, name): register_distributed_operator("lookup_table_v2", DistributedEmbedding("embedding")) +register_distributed_operator("c_embedding", DistributedEmbedding("embedding")) # RowParallel @@ -48,7 +51,7 @@ def __init__(self, name): super(DistributedEmbeddingImpl, self).__init__() self._name = name self._forward_implemented = True - self._backward_implemented = False + self._backward_implemented = True def is_process_mesh_compatible(self, op_dist_attr): """ No restriction for now. """ @@ -102,127 +105,231 @@ def update_dims_mapping(self, op_dist_attr): return changed - def forward(self, serial_op): - def static_handle(dst_block, - src_op, - op_dist_attr, - input_name_mapping, - output_name_mapping, - rank_id=0): - assert len( - input_name_mapping - ) == 2, "row_parallel_embedding take 2 inputs variable but got {}".format( - input_name_mapping) - assert len( - output_name_mapping - ) == 1, "row_parallel_embedding take 2 inputs variable but got {}".format( - output_name_mapping) - assert len( - input_name_mapping['Ids'] - ) == 1, "row_parallel_embedding input Ids take 1 variable but got {}".format( - input_name_mapping['Ids']) - assert len( - input_name_mapping['W'] - ) == 1, "row_parallel_embedding input W take 1 variable but got {}".format( - input_name_mapping['W']) - assert len( - output_name_mapping['Out'] - ) == 1, "row_parallel_embedding input Out take 1 variable but got {}".format( - input_name_mapping['Out']) - - Ids_var = dst_block.var(input_name_mapping['Ids'][0]) - Weight_var = dst_block.var(input_name_mapping['W'][0]) - Out_var = dst_block.var(output_name_mapping['Out'][0]) - - # got dist attribute info - embedding_row_dim_mapping = op_dist_attr.get_input_dims_mapping( - Weight_var.name)[0] - process_mesh_shape = op_dist_attr.get_process_mesh().topology - process_mesh_group = op_dist_attr.get_process_mesh().process_group - - # caculate embedding offset - # TODO generalize here, using cartisian product to allow any dimensional mesh shape - mesh_shape = len(process_mesh_shape) - assert mesh_shape <= 2, "row_parallel_embedding only support 1 or 2 dimensional process mesh, but got {}".format( - process_mesh_shape) - num_partition = process_mesh_shape[embedding_row_dim_mapping] - # TODO generalize here, support any mesh group - model_parallel_axis, process_mesh = op_dist_attr.get_owner_context( - )._get_model_parallel_info() - if mesh_shape == 1: - if rank_id not in process_mesh_group: - assert len( - process_mesh.topology - ) == 2, " row_parallel_embedding process mapping only support 2 dimensional process mesh, \ - but got {}".format(len(process_mesh.topology)) - rank_id = process_mesh_group[ - process_mesh.process_group.index(rank_id) % - process_mesh_shape[0]] - relative_idx = process_mesh_group.index(rank_id) 
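The rewritten forward below lowers the sharded lookup to a c_embedding on the rank's local slice of W (shifted by a start_index offset) followed by a c_allreduce_sum over the row-parallel group. A minimal NumPy sketch of that semantics (helper and argument names are illustrative only, not part of the patch):

    import numpy as np

    def local_lookup(ids, local_weight, row_offset):
        # this rank owns embedding rows [row_offset, row_offset + local_weight.shape[0])
        local_ids = ids - row_offset
        mask = (local_ids >= 0) & (local_ids < local_weight.shape[0])
        out = np.zeros(ids.shape + (local_weight.shape[1],), dtype=local_weight.dtype)
        out[mask] = local_weight[local_ids[mask]]
        # ids outside the local range contribute zeros, so summing this result
        # across all ranks in the group (the c_allreduce_sum) reproduces the
        # full, unsharded embedding lookup
        return out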
+ @staticmethod + def forward(ctx, *args, **kwargs): + """ + kwargs: inputname_mapping & outputname_mapping + """ + + dist_op_helper = ctx.get_dist_op_helper() + main_block = dist_op_helper.get_dst_main_program().global_block() + startup_block = dist_op_helper.get_dst_startup_program().global_block() + src_op = dist_op_helper.get_cur_src_op() + rank_id = dist_op_helper.get_rank_id() + op_dist_attr = ctx.get_op_distributed_attr_for_program(src_op) + assert op_dist_attr is not None, "backward op [{}] don't have dist attribute !".format( + str(src_op)) + + # check validation of inputs / outputs + assert 'Ids' in kwargs, "input [{}] is not given".format('Ids') + assert 'W' in kwargs, "input [{}] is not given".format('W') + assert 'Out' in kwargs, "output [{}] is not given".format('Out') + + assert len( + kwargs['Ids'] + ) == 1, "row_parallel_embedding input Ids take 1 variable but got {}".format( + kwargs['Ids']) + assert len( + kwargs['W'] + ) == 1, "row_parallel_embedding input W take 1 variable but got {}".format( + kwargs['W']) + assert len( + kwargs['Out'] + ) == 1, "row_parallel_embedding output Out take 1 variable but got {}".format( + kwargs['Out']) + + Ids_var = main_block.var(kwargs['Ids'][0]) + Weight_var = main_block.var(kwargs['W'][0]) + Out_var = main_block.var(kwargs['Out'][0]) + + # got dist attribute info + embedding_row_dim_mapping = op_dist_attr.get_input_dims_mapping( + Weight_var.name)[0] + assert embedding_row_dim_mapping >= 0, "row_parallel_embedding's row should be divided by a specific mesh axis, but got [{}]".format( + embedding_row_dim_mapping) + process_mesh_shape = op_dist_attr.get_process_mesh().topology + process_mesh_group = op_dist_attr.get_process_mesh().process_group + + # FIXME (JZ-LIANG) Remove this hack to support any op mesh group for Pipeline Parallelism + if rank_id not in process_mesh_group: + rank_id = _get_corresponding_rank(op_dist_attr.get_process_mesh(), + rank_id) + + # A generalized method to caculate embedding offset using cartisian product + relative_idx = _get_idx_in_axis(process_mesh_group, process_mesh_shape, + embedding_row_dim_mapping, rank_id) + + per_part_size = Weight_var.shape[0] + relative_idx = relative_idx * per_part_size + + # TODO caculate ring id + parallel_axis = embedding_row_dim_mapping + group_ranks = _get_comm_group(process_mesh_group, process_mesh_shape, + parallel_axis, rank_id) + group = new_process_group(group_ranks) + + # append op + check_variable_and_dtype(Ids_var, 'input', ['int32', 'int64'], + 'c_embedding') + + intermediate_var_0 = main_block.create_var( + name=unique_name.generate_with_ignorable_key(".".join( + ["c_embedding", 'tmp'])), + dtype=Weight_var.dtype, + shape=Out_var.shape, + type=core.VarDesc.VarType.LOD_TENSOR, + persistable=False, + stop_gradient=Out_var.stop_gradient) + + # copy Out_var's dist_attr to intermediate_var_0's dist_attr + copy_distributed_attr_for_var(op_dist_attr, intermediate_var_0, Out_var) + + check_variable_and_dtype( + Out_var, 'tensor', + ['float16', 'float32', 'float64', 'int32', 'int64'], + 'c_allreduce_sum') + + c_embedding_op = main_block.append_op( + type='c_embedding', + inputs={'Ids': [Ids_var], + 'W': [Weight_var]}, + outputs={'Out': [intermediate_var_0]}, + attrs={"start_index": relative_idx}) + + # use_model_parallel + c_allreduce_sum_op = main_block.append_op( + type='c_allreduce_sum', + inputs={'X': [intermediate_var_0]}, + outputs={'Out': [Out_var]}, + attrs={ + 'ring_id': group.id, + 'use_calc_stream': True, + 'use_model_parallel': True, + }) + + # copy serial op's 
dist_attr to dist op's dist_attr + copy_distributed_attr_for_dist_op(c_embedding_op, main_block, + op_dist_attr) + copy_distributed_attr_for_dist_op(c_allreduce_sum_op, main_block, + op_dist_attr) + + # param initialization sync + assert Weight_var.name not in dist_op_helper.already_init_sync_vars + dist_op_helper.already_init_sync_vars.add(Weight_var.name) + param = startup_block.var(Weight_var.name) + param_dist_attr = ctx.get_tensor_distributed_attr_for_program(param) + process_mesh = param_dist_attr.get_process_mesh() + dim_mapping = param_dist_attr.get_dims_mapping() + + # NOTE all not splited axis should be presented in mesh + for axis, size in enumerate(process_mesh.topology): + if size <= 1 or axis in dim_mapping: + pass else: - relative_idx = rank_id % num_partition + group_ranks = _get_comm_group(process_mesh.process_group, + process_mesh.topology, axis, + rank_id) + sync_group = new_process_group(group_ranks) + + startup_block.append_op( + type='c_broadcast', + inputs={'X': param}, + outputs={'Out': param}, + attrs={ + 'ring_id': sync_group.id, + 'root': 0, + 'use_calc_stream': True, + OP_ROLE_KEY: OpRole.Forward + }) + startup_block._sync_with_cpp() + + @staticmethod + def backward(ctx, *args, **kwargs): + + # by now the backward function only insert the gradient allreduce for dist op itself + dist_op_helper = ctx.get_dist_op_helper() + main_block = dist_op_helper.get_dst_main_program().global_block() + backward_op = dist_op_helper.get_cur_src_op() + rank_id = dist_op_helper.get_rank_id() + dist_attr = ctx.get_op_distributed_attr_for_program(backward_op) + assert dist_attr is not None, "backward op [{}] don't have dist attribute !".format( + str(backward_op)) - per_part_size = Weight_var.shape[0] - relative_idx = relative_idx * per_part_size + # FIXME (JZ-LIANG) Remove this hack to support any op mesh group for Pipeline Parallelism + if rank_id not in dist_attr.get_process_mesh().process_group: + rank_id = _get_corresponding_rank(dist_attr.get_process_mesh(), + rank_id) + + # check if need gradient allreduce + need_gradient_allreduce = False + + assert 'Ids' in kwargs, "input [{}] is not given".format('Ids') + assert 'W' in kwargs, "input [{}] is not given".format('W') + assert 'Out@GRAD' in kwargs, "input [{}] is not given".format('Out') + assert 'W@GRAD' in kwargs, "output [{}] is not given".format('W@GRAD') + + assert len( + kwargs['Ids'] + ) == 1, "row_parallel_embedding input Ids take 1 variable but got {}".format( + kwargs['Ids']) + assert len( + kwargs['W'] + ) == 1, "row_parallel_embedding input Ids take 1 variable but got {}".format( + kwargs['W']) + assert len( + kwargs['Out@GRAD'] + ) == 1, "row_parallel_embedding input Ids take 1 variable but got {}".format( + kwargs['Out']) + assert len( + kwargs['W@GRAD'] + ) == 1, "row_parallel_embedding output Ids take 1 variable but got {}".format( + kwargs['W@GRAD']) + + Ids_var = main_block.var(kwargs['Ids'][0]) + process_mesh = dist_attr.get_process_mesh() + var_dim_mapping = dist_attr.get_input_dims_mapping(Ids_var.name) + mesh_shape = process_mesh.topology + batch_size_axis = var_dim_mapping[0] + if batch_size_axis > -1 and mesh_shape[batch_size_axis] > 1: + need_gradient_allreduce = True - # TODO caculate ring id group_ranks = _get_comm_group(process_mesh.process_group, process_mesh.topology, - model_parallel_axis, rank_id) - group = new_process_group(group_ranks) - - # append op - check_variable_and_dtype(Ids_var, 'input', ['int32', 'int64'], - 'c_embedding') - - intermediate_var_0 = dst_block.create_var( - 
name=unique_name.generate_with_ignorable_key(".".join( - ["c_embedding", 'tmp'])), - dtype=Weight_var.dtype, - shape=Out_var.shape, - type=core.VarDesc.VarType.LOD_TENSOR, - persistable=False, - stop_gradient=Out_var.stop_gradient) - # copy Out_var's dist_attr to intermediate_var_0's dist_attr - copy_distributed_attr_for_var(op_dist_attr, intermediate_var_0, - Out_var) - - check_variable_and_dtype( - Out_var, 'tensor', - ['float16', 'float32', 'float64', 'int32', 'int64'], - 'c_allreduce_sum') - - c_embedding_op = dst_block.append_op( - type='c_embedding', - inputs={'Ids': [Ids_var], - 'W': [Weight_var]}, - outputs={'Out': [intermediate_var_0]}, - attrs={"start_index": relative_idx}) - - # use_model_parallel - c_allreduce_sum_op = dst_block.append_op( + batch_size_axis, rank_id) + dp_degree = len(group_ranks) + dp_group = new_process_group(group_ranks) + + if need_gradient_allreduce: + W_Grad_var = main_block.var(kwargs['W@GRAD'][0]) + allreduce_op = main_block.append_op( type='c_allreduce_sum', - inputs={'X': [intermediate_var_0]}, - outputs={'Out': [Out_var]}, + inputs={'X': [W_Grad_var]}, + outputs={'Out': [W_Grad_var]}, attrs={ - 'ring_id': group.id, + 'ring_id': dp_group.id, 'use_calc_stream': True, - 'use_model_parallel': True, + OP_ROLE_KEY: OpRole.Backward }) + scale_op = main_block.append_op( + type='scale', + inputs={'X': W_Grad_var}, + outputs={'Out': W_Grad_var}, + attrs={'scale': 1.0 / dp_degree, + OP_ROLE_KEY: OpRole.Backward}) + main_block._sync_with_cpp() - # copy serial op's dist_attr to dist op's dist_attr - copy_distributed_attr_for_dist_op(c_embedding_op, dst_block, - op_dist_attr) - copy_distributed_attr_for_dist_op(c_allreduce_sum_op, dst_block, - op_dist_attr) - - if in_dygraph_mode(): - raise NotImplementedError( - "Dist op for [{}] with idx [{}] is NOT implemented yet.".format( - "matmul", 0)) - else: - return static_handle + dims_mapping = ctx.get_tensor_distributed_attr_for_program( + W_Grad_var).get_dims_mapping() + process_mesh = dist_attr.get_process_mesh() + for op in [allreduce_op, scale_op]: + op_attr = OperatorDistributedAttribute(op, ctx) + op_attr.set_process_mesh(process_mesh) + op_attr.set_output_dims_mapping(W_Grad_var.name, dims_mapping) + op_attr.set_input_dims_mapping(W_Grad_var.name, dims_mapping) + ctx.set_op_distributed_attr_for_program(op, op_attr) register_distributed_operator_impl("lookup_table_v2", DistributedEmbeddingImpl("row_parallel")) +register_distributed_operator_impl("c_embedding", + DistributedEmbeddingImpl("row_parallel")) diff --git a/python/paddle/distributed/auto_parallel/operators/dist_matmul.py b/python/paddle/distributed/auto_parallel/operators/dist_matmul.py index 10a01dc57ed2b..2edbcd2318cdf 100644 --- a/python/paddle/distributed/auto_parallel/operators/dist_matmul.py +++ b/python/paddle/distributed/auto_parallel/operators/dist_matmul.py @@ -24,12 +24,14 @@ from ..utils import compute_compatible_dim_mapping from ..utils import compute_compatible_dims_mapping from ..utils import compute_compatible_and_update_dim_mapping +from ..attribute import OperatorDistributedAttribute from paddle.fluid import core, unique_name from paddle.fluid.framework import in_dygraph_mode from paddle.fluid.framework import Program, Parameter, Variable, program_guard from paddle.fluid.data_feeder import check_variable_and_dtype, check_dtype +from paddle.distributed.fleet.meta_optimizers.common import OpRole, OP_ROLE_KEY, OP_ROLE_VAR_KEY from ..process import new_process_group -from ..utils import _get_comm_group +from ..utils import _get_comm_group, 
_get_corresponding_rank def _update_dims_mapping_for_matmul(op_dist_attr): @@ -123,6 +125,130 @@ def _update_dims_mapping_for_matmul(op_dist_attr): return changed +def _right_operand_parameter_matmul_backward(ctx, *args, **kwargs): + + # by now the backward function only insert the gradient allreduce for dist op itself + + dist_op_helper = ctx.get_dist_op_helper() + main_block = dist_op_helper.get_dst_main_program().global_block() + backward_op = dist_op_helper.get_cur_src_op() + rank_id = dist_op_helper.get_rank_id() + dist_attr = ctx.get_op_distributed_attr_for_program(backward_op) + assert dist_attr is not None, "backward op [{}] don't have dist attribute !".format( + str(backward_op)) + + # FIXME (JZ-LIANG) Remove this hack to support any op mesh group for Pipeline Parallelism + if rank_id not in dist_attr.get_process_mesh().process_group: + rank_id = _get_corresponding_rank(dist_attr.get_process_mesh(), rank_id) + + # check if need gradient allreduce + need_gradient_allreduce = False + + assert 'Y' in kwargs, "input [{}] is not given".format('Y') + assert 'X' in kwargs, "input [{}] is not given".format('X') + assert 'Out@GRAD' in kwargs, "input [{}] is not given".format('Out@GRAD') + assert 'Y@GRAD' in kwargs, "output [{}] is not given".format('Y@GRAD') + assert 'X@GRAD' in kwargs, "output [{}] is not given".format('X@GRAD') + + assert len( + kwargs['Y'] + ) == 1, "row_parallel_embedding input Ids take 1 variable but got {}".format( + kwargs['Y']) + assert len( + kwargs['X'] + ) == 1, "row_parallel_embedding input Ids take 1 variable but got {}".format( + kwargs['X']) + assert len( + kwargs['Out@GRAD'] + ) == 1, "row_parallel_embedding input Ids take 1 variable but got {}".format( + kwargs['Out']) + assert len( + kwargs['Y@GRAD'] + ) == 1, "row_parallel_embedding output Ids take 1 variable but got {}".format( + kwargs['Y@GRAD']) + assert len( + kwargs['X@GRAD'] + ) == 1, "row_parallel_embedding output Ids take 1 variable but got {}".format( + kwargs['X@GRAD']) + + X_var = main_block.var(kwargs['X'][0]) + assert not X_var.is_parameter, "left operand(X) [{}] of dist matmul should not be parameter".format( + X_var.name) + + process_mesh = dist_attr.get_process_mesh() + var_dim_mapping = dist_attr.get_input_dims_mapping(X_var.name) + mesh_shape = process_mesh.topology + batch_size_axis = var_dim_mapping[0] + if batch_size_axis > -1 and mesh_shape[batch_size_axis] > 1: + need_gradient_allreduce = True + group_ranks = _get_comm_group(process_mesh.process_group, + process_mesh.topology, batch_size_axis, + rank_id) + dp_degree = len(group_ranks) + dp_group = new_process_group(group_ranks) + + Y_var = main_block.var(kwargs['Y'][0]) + if need_gradient_allreduce and Y_var.is_parameter: + Y_Grad_var = main_block.var(kwargs['Y@GRAD'][0]) + allreduce_op = main_block.append_op( + type='c_allreduce_sum', + inputs={'X': [Y_Grad_var]}, + outputs={'Out': [Y_Grad_var]}, + attrs={ + 'ring_id': dp_group.id, + 'use_calc_stream': True, + OP_ROLE_KEY: OpRole.Backward + }) + scale_op = main_block.append_op( + type='scale', + inputs={'X': Y_Grad_var}, + outputs={'Out': Y_Grad_var}, + attrs={'scale': 1.0 / dp_degree, + OP_ROLE_KEY: OpRole.Backward}) + main_block._sync_with_cpp() + + dims_mapping = ctx.get_tensor_distributed_attr_for_program( + Y_Grad_var).get_dims_mapping() + process_mesh = dist_attr.get_process_mesh() + for op in [allreduce_op, scale_op]: + op_attr = OperatorDistributedAttribute(op, ctx) + op_attr.set_process_mesh(process_mesh) + op_attr.set_output_dims_mapping(Y_Grad_var.name, dims_mapping) + 
op_attr.set_input_dims_mapping(Y_Grad_var.name, dims_mapping) + ctx.set_op_distributed_attr_for_program(op, op_attr) + + +def _init_param_sync(Weight_var, dist_op_helper, startup_block, ctx, rank_id): + + assert Weight_var.name not in dist_op_helper.already_init_sync_vars + assert startup_block.has_var(Weight_var.name) + dist_op_helper.already_init_sync_vars.add(Weight_var.name) + param = startup_block.var(Weight_var.name) + param_dist_attr = ctx.get_tensor_distributed_attr_for_program(param) + process_mesh = param_dist_attr.get_process_mesh() + dim_mapping = param_dist_attr.get_dims_mapping() + + for axis, size in enumerate(process_mesh.topology): + if size <= 1 or axis in dim_mapping: + pass + else: + group_ranks = _get_comm_group(process_mesh.process_group, + process_mesh.topology, axis, rank_id) + sync_group = new_process_group(group_ranks) + + startup_block.append_op( + type='c_broadcast', + inputs={'X': param}, + outputs={'Out': param}, + attrs={ + 'ring_id': sync_group.id, + 'root': 0, + 'use_calc_stream': True, + OP_ROLE_KEY: OpRole.Forward + }) + startup_block._sync_with_cpp() + + class DistributedMatmul(DistributedOperator): def __init__(self, name): super(DistributedMatmul, self).__init__() @@ -138,7 +264,7 @@ def __init__(self, name): super(DistributedMatmulImpl0, self).__init__() self._name = name self._forward_implemented = True - self._backward_implemented = False + self._backward_implemented = True def is_process_mesh_compatible(self, op_dist_attr): """ No restriction for now. """ @@ -178,101 +304,109 @@ def update_dims_mapping(self, op_dist_attr): changed = True return changed - def forward(self, serial_op): - def static_handle(dst_block, - src_op, - op_dist_attr, - input_name_mapping, - output_name_mapping, - rank_id=0): - assert len( - input_name_mapping - ) == 2, "col_parallel_linear take 2 inputs variable but got {}".format( - input_name_mapping) - assert len( - output_name_mapping - ) == 1, "col_parallel_linear take 2 inputs variable but got {}".format( - output_name_mapping) - assert len( - input_name_mapping['X'] - ) == 1, "col_parallel_linear input X take 1 variable but got {}".format( - input_name_mapping['X']) - assert len( - input_name_mapping['Y'] - ) == 1, "col_parallel_linear input Y take 1 variable but got {}".format( - input_name_mapping['Y']) - assert len( - output_name_mapping['Out'] - ) == 1, "col_parallel_linear input Out take 1 variable but got {}".format( - input_name_mapping['Out']) - X_var = dst_block.var(input_name_mapping['X'][0]) - Weight_var = dst_block.var(input_name_mapping['Y'][0]) - Out_var = dst_block.var(output_name_mapping['Out'][0]) - - # TODO infer logic comm presentation - model_parallel_axis, process_mesh = op_dist_attr.get_owner_context( - )._get_model_parallel_info() - group_ranks = _get_comm_group(process_mesh.process_group, - process_mesh.topology, - model_parallel_axis, rank_id) - group = new_process_group(group_ranks) - - intermediate_var_0 = dst_block.create_var( - name=unique_name.generate_with_ignorable_key(".".join( - ["c_identity", 'tmp'])), - dtype=X_var.dtype, - shape=X_var.shape, - type=core.VarDesc.VarType.LOD_TENSOR, - persistable=False, - stop_gradient=X_var.stop_gradient) - # copy X_var's dist_attr to intermediate_var_0's dist_attr - copy_distributed_attr_for_var(op_dist_attr, intermediate_var_0, - X_var) - - check_variable_and_dtype( - X_var, 'tensor', - ['float16', 'float32', 'float64', 'int32', 'int64'], - '_c_identity') - - c_identity_op = dst_block.append_op( - type='c_identity', - inputs={'X': [X_var]}, - 
outputs={'Out': intermediate_var_0}, - attrs={ - 'ring_id': group.id, - 'use_calc_stream': True, - 'use_model_parallel': True, - }) - - check_variable_and_dtype(intermediate_var_0, 'x', - ['float16', 'float32', 'float64'], - 'linear') - check_dtype(intermediate_var_0.dtype, 'dtype', - ['float16', 'float32', 'float64'], 'linear') - attrs = { - 'transpose_X': False, - 'transpose_Y': False, - 'alpha': 1, - } - inputs = {'X': [intermediate_var_0], 'Y': [Weight_var]} - matmul_op = dst_block.append_op( - type='matmul', - inputs=inputs, - outputs={'Out': Out_var}, - attrs=attrs) - - # copy serial op's dist_attr to dist op's dist_attr - copy_distributed_attr_for_dist_op(c_identity_op, dst_block, - op_dist_attr) - copy_distributed_attr_for_dist_op(matmul_op, dst_block, - op_dist_attr) - - if in_dygraph_mode(): - raise NotImplementedError( - "Dist op for [{}] with idx [{}] is NOT implemented yet.".format( - "matmul", 0)) - else: - return static_handle + @staticmethod + def forward(ctx, *args, **kwargs): + """ + kwargs: inputname_mapping & outputname_mapping + """ + + dist_op_helper = ctx.get_dist_op_helper() + main_block = dist_op_helper.get_dst_main_program().global_block() + startup_block = dist_op_helper.get_dst_startup_program().global_block() + src_op = dist_op_helper.get_cur_src_op() + rank_id = dist_op_helper.get_rank_id() + op_dist_attr = ctx.get_op_distributed_attr_for_program(src_op) + assert op_dist_attr is not None, "backward op [{}] don't have dist attribute !".format( + str(src_op)) + + # FIXME (JZ-LIANG) Remove this hack to support any op mesh group for Pipeline Parallelism + if rank_id not in op_dist_attr.get_process_mesh().process_group: + rank_id = _get_corresponding_rank(op_dist_attr.get_process_mesh(), + rank_id) + + # check validation of inputs / outputs + for input_name in src_op.desc.input_names(): + assert input_name in kwargs, "input [{}] is not given".format( + input_name) + assert len(kwargs[input_name]) == len( + src_op.desc.input(input_name) + ), "number of tensor for input [{}] is not match".format(input_name) + for output_name in src_op.desc.output_names(): + assert output_name in kwargs, "input [{}] is not given".format( + output_name) + assert len(kwargs[output_name]) == len( + src_op.desc.output(output_name) + ), "number of tensor for input [{}] is not match".format( + output_name) + + X_var = main_block.var(kwargs['X'][0]) + Weight_var = main_block.var(kwargs['Y'][0]) + Out_var = main_block.var(kwargs['Out'][0]) + + # TODO infer logic comm presentation + matmul_col_dim_mapping = op_dist_attr.get_input_dims_mapping( + Weight_var.name)[1] + assert matmul_col_dim_mapping >= 0, "col_parallel_matmul's row should be divided by a specific mesh axis, but got [{}]".format( + matmul_col_dim_mapping) + process_mesh_shape = op_dist_attr.get_process_mesh().topology + process_mesh_group = op_dist_attr.get_process_mesh().process_group + + parallel_axis = matmul_col_dim_mapping + group_ranks = _get_comm_group(process_mesh_group, process_mesh_shape, + parallel_axis, rank_id) + group = new_process_group(group_ranks) + + intermediate_var_0 = main_block.create_var( + name=unique_name.generate_with_ignorable_key(".".join( + ["c_identity", 'tmp'])), + dtype=X_var.dtype, + shape=X_var.shape, + type=core.VarDesc.VarType.LOD_TENSOR, + persistable=False, + stop_gradient=X_var.stop_gradient) + # copy X_var's dist_attr to intermediate_var_0's dist_attr + copy_distributed_attr_for_var(op_dist_attr, intermediate_var_0, X_var) + + check_variable_and_dtype( + X_var, 'tensor', + ['float16', 
'float32', 'float64', 'int32', 'int64'], '_c_identity') + + c_identity_op = main_block.append_op( + type='c_identity', + inputs={'X': [X_var]}, + outputs={'Out': intermediate_var_0}, + attrs={ + 'ring_id': group.id, + 'use_calc_stream': True, + 'use_model_parallel': True, + }) + + check_variable_and_dtype(intermediate_var_0, 'x', + ['float16', 'float32', 'float64'], 'linear') + check_dtype(intermediate_var_0.dtype, 'dtype', + ['float16', 'float32', 'float64'], 'linear') + attrs = { + 'transpose_X': False, + 'transpose_Y': False, + 'alpha': 1, + } + inputs = {'X': [intermediate_var_0], 'Y': [Weight_var]} + matmul_op = main_block.append_op( + type='matmul', inputs=inputs, outputs={'Out': Out_var}, attrs=attrs) + + # copy serial op's dist_attr to dist op's dist_attr + copy_distributed_attr_for_dist_op(c_identity_op, main_block, + op_dist_attr) + copy_distributed_attr_for_dist_op(matmul_op, main_block, op_dist_attr) + + # init param sync + if Weight_var.is_parameter: + _init_param_sync(Weight_var, dist_op_helper, startup_block, ctx, + rank_id) + + @staticmethod + def backward(ctx, *args, **kwargs): + _right_operand_parameter_matmul_backward(ctx, *args, **kwargs) # RowParallel @@ -281,7 +415,7 @@ def __init__(self, name): super(DistributedMatmulImpl1, self).__init__() self._name = name self._forward_implemented = True - self._backward_implemented = False + self._backward_implemented = True def is_process_mesh_compatible(self, op_dist_attr): """ No restriction for now. """ @@ -323,95 +457,108 @@ def update_dims_mapping(self, op_dist_attr): changed = True return changed - def forward(self, serial_op): - def static_handle(dst_block, - src_op, - op_dist_attr, - input_name_mapping, - output_name_mapping, - rank_id=0): - assert len( - input_name_mapping - ) == 2, "col_parallel_linear take 2 inputs variable but got {}".format( - input_name_mapping) - assert len( - output_name_mapping - ) == 1, "col_parallel_linear take 2 inputs variable but got {}".format( - output_name_mapping) - assert len( - input_name_mapping['X'] - ) == 1, "col_parallel_linear input X take 1 variable but got {}".format( - input_name_mapping['X']) - assert len( - input_name_mapping['Y'] - ) == 1, "col_parallel_linear input Y take 1 variable but got {}".format( - input_name_mapping['Y']) - assert len( - output_name_mapping['Out'] - ) == 1, "col_parallel_linear input Out take 1 variable but got {}".format( - input_name_mapping['Out']) - X_var = dst_block.var(input_name_mapping['X'][0]) - Weight_var = dst_block.var(input_name_mapping['Y'][0]) - Out_var = dst_block.var(output_name_mapping['Out'][0]) - - # TODO infer logic comm presentation - model_parallel_axis, process_mesh = op_dist_attr.get_owner_context( - )._get_model_parallel_info() - group_ranks = _get_comm_group(process_mesh.process_group, - process_mesh.topology, - model_parallel_axis, rank_id) - group = new_process_group(group_ranks) - - check_variable_and_dtype( - X_var, 'x', ['float16', 'float32', 'float64'], 'linear') - check_dtype(X_var.dtype, 'dtype', - ['float16', 'float32', 'float64'], 'linear') - attrs = { - 'transpose_X': False, - 'transpose_Y': False, - 'alpha': 1, - } - inputs = {'X': X_var, 'Y': Weight_var} - intermediate_var_0 = dst_block.create_var( - shape=Out_var.shape, - dtype=Out_var.dtype, - type=Out_var.type, - lod_level=Out_var.lod_level, - persistable=False, - is_data=False, - need_check_feed=Out_var.desc.need_check_feed()) - # copy Out_var's dist_attr to intermediate_var_0's dist_attr - copy_distributed_attr_for_var(op_dist_attr, intermediate_var_0, - 
Out_var) - - matmul_op = dst_block.append_op( - type='matmul', - inputs=inputs, - outputs={'Out': intermediate_var_0}, - attrs=attrs) - - c_allreduce_sum_op = dst_block.append_op( - type='c_allreduce_sum', - inputs={'X': intermediate_var_0}, - outputs={'Out': Out_var}, - attrs={ - 'ring_id': group.id, - 'use_calc_stream': True, - 'use_model_parallel': True - }) - - # copy serial op's dist_attr to dist op's dist_attr - copy_distributed_attr_for_dist_op(matmul_op, dst_block, - op_dist_attr) - copy_distributed_attr_for_dist_op(c_allreduce_sum_op, dst_block, - op_dist_attr) - - if in_dygraph_mode(): - raise NotImplementedError( - "Dist op for [{}] with idx [{}] is NOT implemented yet.".format( - "matmul", 0)) - else: - return static_handle + @staticmethod + def forward(ctx, *args, **kwargs): + """ + kwargs: inputname_mapping & outputname_mapping + """ + + dist_op_helper = ctx.get_dist_op_helper() + main_block = dist_op_helper.get_dst_main_program().global_block() + startup_block = dist_op_helper.get_dst_startup_program().global_block() + src_op = dist_op_helper.get_cur_src_op() + rank_id = dist_op_helper.get_rank_id() + op_dist_attr = ctx.get_op_distributed_attr_for_program(src_op) + assert op_dist_attr is not None, "backward op [{}] don't have dist attribute !".format( + str(src_op)) + + # FIXME (JZ-LIANG) Remove this hack to support any op mesh group for Pipeline Parallelism + if rank_id not in op_dist_attr.get_process_mesh().process_group: + rank_id = _get_corresponding_rank(op_dist_attr.get_process_mesh(), + rank_id) + + # check validation of inputs / outputs + for input_name in src_op.desc.input_names(): + assert input_name in kwargs, "input [{}] is not given".format( + input_name) + assert len(kwargs[input_name]) == len( + src_op.desc.input(input_name) + ), "number of tensor for input [{}] is not match".format(input_name) + for output_name in src_op.desc.output_names(): + assert output_name in kwargs, "input [{}] is not given".format( + output_name) + assert len(kwargs[output_name]) == len( + src_op.desc.output(output_name) + ), "number of tensor for input [{}] is not match".format( + output_name) + + X_var = main_block.var(kwargs['X'][0]) + Weight_var = main_block.var(kwargs['Y'][0]) + Out_var = main_block.var(kwargs['Out'][0]) + + # TODO infer logic comm presentation + matmul_row_dim_mapping = op_dist_attr.get_input_dims_mapping( + Weight_var.name)[0] + assert matmul_row_dim_mapping >= 0, "row_parallel_matmul's row should be divided by a specific mesh axis, but got [{}]".format( + matmul_row_dim_mapping) + process_mesh_shape = op_dist_attr.get_process_mesh().topology + process_mesh_group = op_dist_attr.get_process_mesh().process_group + + parallel_axis = matmul_row_dim_mapping + group_ranks = _get_comm_group(process_mesh_group, process_mesh_shape, + parallel_axis, rank_id) + group = new_process_group(group_ranks) + + check_variable_and_dtype(X_var, 'x', ['float16', 'float32', 'float64'], + 'linear') + check_dtype(X_var.dtype, 'dtype', ['float16', 'float32', 'float64'], + 'linear') + attrs = { + 'transpose_X': False, + 'transpose_Y': False, + 'alpha': 1, + } + inputs = {'X': X_var, 'Y': Weight_var} + intermediate_var_0 = main_block.create_var( + shape=Out_var.shape, + dtype=Out_var.dtype, + type=Out_var.type, + lod_level=Out_var.lod_level, + persistable=False, + is_data=False, + need_check_feed=Out_var.desc.need_check_feed()) + # copy Out_var's dist_attr to intermediate_var_0's dist_attr + copy_distributed_attr_for_var(op_dist_attr, intermediate_var_0, Out_var) + + matmul_op = 
main_block.append_op( + type='matmul', + inputs=inputs, + outputs={'Out': intermediate_var_0}, + attrs=attrs) + + c_allreduce_sum_op = main_block.append_op( + type='c_allreduce_sum', + inputs={'X': intermediate_var_0}, + outputs={'Out': Out_var}, + attrs={ + 'ring_id': group.id, + 'use_calc_stream': True, + 'use_model_parallel': True + }) + + # copy serial op's dist_attr to dist op's dist_attr + copy_distributed_attr_for_dist_op(matmul_op, main_block, op_dist_attr) + copy_distributed_attr_for_dist_op(c_allreduce_sum_op, main_block, + op_dist_attr) + + # init param sync + if Weight_var.is_parameter: + _init_param_sync(Weight_var, dist_op_helper, startup_block, ctx, + rank_id) + + @staticmethod + def backward(ctx, *args, **kwargs): + _right_operand_parameter_matmul_backward(ctx, *args, **kwargs) # ReplicateParallel @@ -465,6 +612,10 @@ def update_dims_mapping(self, op_dist_attr): changed = True return changed + @staticmethod + def backward(ctx, *args, **kwargs): + _right_operand_parameter_matmul_backward(ctx, *args, **kwargs) + register_distributed_operator_impl("matmul", DistributedMatmulImpl0("column_parallel")) @@ -489,7 +640,7 @@ def __init__(self, name): super(DistributedMatmulV2Impl0, self).__init__() self._name = name self._forward_implemented = True - self._backward_implemented = False + self._backward_implemented = True def is_process_mesh_compatible(self, op_dist_attr): """ No restriction for now. """ @@ -529,97 +680,109 @@ def update_dims_mapping(self, op_dist_attr): changed = True return changed - def forward(self, serial_op): - def static_handle(dst_block, - src_op, - op_dist_attr, - input_name_mapping, - output_name_mapping, - rank_id=0): - assert len( - input_name_mapping - ) == 2, "col_parallel_linear take 2 inputs variable but got {}".format( - input_name_mapping) - assert len( - output_name_mapping - ) == 1, "col_parallel_linear take 2 inputs variable but got {}".format( - output_name_mapping) - assert len( - input_name_mapping['X'] - ) == 1, "col_parallel_linear input X take 1 variable but got {}".format( - input_name_mapping['X']) - assert len( - input_name_mapping['Y'] - ) == 1, "col_parallel_linear input Y take 1 variable but got {}".format( - input_name_mapping['Y']) - assert len( - output_name_mapping['Out'] - ) == 1, "col_parallel_linear input Out take 1 variable but got {}".format( - input_name_mapping['Out']) - X_var = dst_block.var(input_name_mapping['X'][0]) - Weight_var = dst_block.var(input_name_mapping['Y'][0]) - Out_var = dst_block.var(output_name_mapping['Out'][0]) - - # TODO infer logic comm presentation - model_parallel_axis, process_mesh = op_dist_attr.get_owner_context( - )._get_model_parallel_info() - group_ranks = _get_comm_group(process_mesh.process_group, - process_mesh.topology, - model_parallel_axis, rank_id) - group = new_process_group(group_ranks) - - intermediate_var_0 = dst_block.create_var( - name=unique_name.generate_with_ignorable_key(".".join( - ["c_identity", 'tmp'])), - dtype=X_var.dtype, - shape=X_var.shape, - type=core.VarDesc.VarType.LOD_TENSOR, - persistable=False, - stop_gradient=X_var.stop_gradient) - # copy X_var's dist_attr to intermediate_var_0's dist_attr - copy_distributed_attr_for_var(op_dist_attr, intermediate_var_0, - X_var) - - check_variable_and_dtype( - X_var, 'tensor', - ['float16', 'float32', 'float64', 'int32', 'int64'], - '_c_identity') - - c_identity_op = dst_block.append_op( - type='c_identity', - inputs={'X': [X_var]}, - outputs={'Out': intermediate_var_0}, - attrs={ - 'ring_id': group.id, - 'use_calc_stream': 
True, - 'use_model_parallel': True, - }) - - check_variable_and_dtype(intermediate_var_0, 'x', - ['float16', 'float32', 'float64'], - 'linear') - check_dtype(intermediate_var_0.dtype, 'dtype', - ['float16', 'float32', 'float64'], 'linear') - attrs = {'trans_x': False, 'trans_y': False} - inputs = {'X': [intermediate_var_0], 'Y': [Weight_var]} - matmul_v2_op = dst_block.append_op( - type='matmul_v2', - inputs=inputs, - outputs={'Out': Out_var}, - attrs=attrs) - - # copy serial op's dist_attr to dist op's dist_attr - copy_distributed_attr_for_dist_op(c_identity_op, dst_block, - op_dist_attr) - copy_distributed_attr_for_dist_op(matmul_v2_op, dst_block, - op_dist_attr) - - if in_dygraph_mode(): - raise NotImplementedError( - "Dist op for [{}] with idx [{}] is NOT implemented yet.".format( - "matmul", 0)) - else: - return static_handle + @staticmethod + def forward(ctx, *args, **kwargs): + """ + kwargs: inputname_mapping & outputname_mapping + """ + + dist_op_helper = ctx.get_dist_op_helper() + main_block = dist_op_helper.get_dst_main_program().global_block() + startup_block = dist_op_helper.get_dst_startup_program().global_block() + src_op = dist_op_helper.get_cur_src_op() + rank_id = dist_op_helper.get_rank_id() + op_dist_attr = ctx.get_op_distributed_attr_for_program(src_op) + assert op_dist_attr is not None, "backward op [{}] don't have dist attribute !".format( + str(src_op)) + + # FIXME (JZ-LIANG) Remove this hack to support any op mesh group for Pipeline Parallelism + if rank_id not in op_dist_attr.get_process_mesh().process_group: + rank_id = _get_corresponding_rank(op_dist_attr.get_process_mesh(), + rank_id) + + # check validation of inputs / outputs + for input_name in src_op.desc.input_names(): + assert input_name in kwargs, "input [{}] is not given".format( + input_name) + assert len(kwargs[input_name]) == len( + src_op.desc.input(input_name) + ), "number of tensor for input [{}] is not match".format(input_name) + for output_name in src_op.desc.output_names(): + assert output_name in kwargs, "input [{}] is not given".format( + output_name) + assert len(kwargs[output_name]) == len( + src_op.desc.output(output_name) + ), "number of tensor for input [{}] is not match".format( + output_name) + + X_var = main_block.var(kwargs['X'][0]) + Weight_var = main_block.var(kwargs['Y'][0]) + Out_var = main_block.var(kwargs['Out'][0]) + + # TODO infer logic comm presentation + matmul_col_dim_mapping = op_dist_attr.get_input_dims_mapping( + Weight_var.name)[1] + assert matmul_col_dim_mapping >= 0, "col_parallel_matmul's row should be divided by a specific mesh axis, but got [{}]".format( + matmul_col_dim_mapping) + process_mesh_shape = op_dist_attr.get_process_mesh().topology + process_mesh_group = op_dist_attr.get_process_mesh().process_group + + parallel_axis = matmul_col_dim_mapping + group_ranks = _get_comm_group(process_mesh_group, process_mesh_shape, + parallel_axis, rank_id) + group = new_process_group(group_ranks) + + intermediate_var_0 = main_block.create_var( + name=unique_name.generate_with_ignorable_key(".".join( + ["c_identity", 'tmp'])), + dtype=X_var.dtype, + shape=X_var.shape, + type=core.VarDesc.VarType.LOD_TENSOR, + persistable=False, + stop_gradient=X_var.stop_gradient) + # copy X_var's dist_attr to intermediate_var_0's dist_attr + copy_distributed_attr_for_var(op_dist_attr, intermediate_var_0, X_var) + + check_variable_and_dtype( + X_var, 'tensor', + ['float16', 'float32', 'float64', 'int32', 'int64'], '_c_identity') + + c_identity_op = main_block.append_op( + type='c_identity', 
+ inputs={'X': [X_var]}, + outputs={'Out': intermediate_var_0}, + attrs={ + 'ring_id': group.id, + 'use_calc_stream': True, + 'use_model_parallel': True, + }) + + check_variable_and_dtype(intermediate_var_0, 'x', + ['float16', 'float32', 'float64'], 'linear') + check_dtype(intermediate_var_0.dtype, 'dtype', + ['float16', 'float32', 'float64'], 'linear') + attrs = {'trans_x': False, 'trans_y': False} + inputs = {'X': [intermediate_var_0], 'Y': [Weight_var]} + matmul_v2_op = main_block.append_op( + type='matmul_v2', + inputs=inputs, + outputs={'Out': Out_var}, + attrs=attrs) + + # copy serial op's dist_attr to dist op's dist_attr + copy_distributed_attr_for_dist_op(c_identity_op, main_block, + op_dist_attr) + copy_distributed_attr_for_dist_op(matmul_v2_op, main_block, + op_dist_attr) + + # init param sync + if Weight_var.is_parameter: + _init_param_sync(Weight_var, dist_op_helper, startup_block, ctx, + rank_id) + + @staticmethod + def backward(ctx, *args, **kwargs): + _right_operand_parameter_matmul_backward(ctx, *args, **kwargs) # RowParallel @@ -628,7 +791,7 @@ def __init__(self, name): super(DistributedMatmulV2Impl1, self).__init__() self._name = name self._forward_implemented = True - self._backward_implemented = False + self._backward_implemented = True def is_process_mesh_compatible(self, op_dist_attr): """ No restriction for now. """ @@ -670,91 +833,105 @@ def update_dims_mapping(self, op_dist_attr): changed = True return changed - def forward(self, serial_op): - def static_handle(dst_block, - src_op, - op_dist_attr, - input_name_mapping, - output_name_mapping, - rank_id=0): - assert len( - input_name_mapping - ) == 2, "col_parallel_linear take 2 inputs variable but got {}".format( - input_name_mapping) - assert len( - output_name_mapping - ) == 1, "col_parallel_linear take 2 inputs variable but got {}".format( - output_name_mapping) - assert len( - input_name_mapping['X'] - ) == 1, "col_parallel_linear input X take 1 variable but got {}".format( - input_name_mapping['X']) - assert len( - input_name_mapping['Y'] - ) == 1, "col_parallel_linear input Y take 1 variable but got {}".format( - input_name_mapping['Y']) - assert len( - output_name_mapping['Out'] - ) == 1, "col_parallel_linear input Out take 1 variable but got {}".format( - input_name_mapping['Out']) - X_var = dst_block.var(input_name_mapping['X'][0]) - Weight_var = dst_block.var(input_name_mapping['Y'][0]) - Out_var = dst_block.var(output_name_mapping['Out'][0]) - - # TODO infer logic comm presentation - model_parallel_axis, process_mesh = op_dist_attr.get_owner_context( - )._get_model_parallel_info() - group_ranks = _get_comm_group(process_mesh.process_group, - process_mesh.topology, - model_parallel_axis, rank_id) - group = new_process_group(group_ranks) - - check_variable_and_dtype( - X_var, 'x', ['float16', 'float32', 'float64'], 'linear') - check_dtype(X_var.dtype, 'dtype', - ['float16', 'float32', 'float64'], 'linear') - attrs = {'trans_x': False, 'trans_y': False} - inputs = {'X': X_var, 'Y': Weight_var} - intermediate_var_0 = dst_block.create_var( - shape=Out_var.shape, - dtype=Out_var.dtype, - type=Out_var.type, - lod_level=Out_var.lod_level, - persistable=False, - is_data=False, - need_check_feed=Out_var.desc.need_check_feed()) - # copy Out_var's dist_attr to intermediate_var_0's dist_attr - copy_distributed_attr_for_var(op_dist_attr, intermediate_var_0, - Out_var) - - matmul_v2_op = dst_block.append_op( - type='matmul_v2', - inputs=inputs, - outputs={'Out': intermediate_var_0}, - attrs=attrs) - - c_allreduce_sum_op 
= dst_block.append_op( - type='c_allreduce_sum', - inputs={'X': intermediate_var_0}, - outputs={'Out': Out_var}, - attrs={ - 'ring_id': group.id, - 'use_calc_stream': True, - 'use_model_parallel': True - }) - - # copy serial op's dist_attr to dist op's dist_attr - copy_distributed_attr_for_dist_op(matmul_v2_op, dst_block, - op_dist_attr) - copy_distributed_attr_for_dist_op(c_allreduce_sum_op, dst_block, - op_dist_attr) - - if in_dygraph_mode(): - raise NotImplementedError( - "Dist op for [{}] with idx [{}] is NOT implemented yet.".format( - "matmul", 0)) - else: - return static_handle + @staticmethod + def forward(ctx, *args, **kwargs): + """ + kwargs: inputname_mapping & outputname_mapping + """ + + dist_op_helper = ctx.get_dist_op_helper() + main_block = dist_op_helper.get_dst_main_program().global_block() + startup_block = dist_op_helper.get_dst_startup_program().global_block() + src_op = dist_op_helper.get_cur_src_op() + rank_id = dist_op_helper.get_rank_id() + op_dist_attr = ctx.get_op_distributed_attr_for_program(src_op) + assert op_dist_attr is not None, "backward op [{}] don't have dist attribute !".format( + str(src_op)) + + # FIXME (JZ-LIANG) Remove this hack to support any op mesh group for Pipeline Parallelism + if rank_id not in op_dist_attr.get_process_mesh().process_group: + rank_id = _get_corresponding_rank(op_dist_attr.get_process_mesh(), + rank_id) + + # check validation of inputs / outputs + for input_name in src_op.desc.input_names(): + assert input_name in kwargs, "input [{}] is not given".format( + input_name) + assert len(kwargs[input_name]) == len( + src_op.desc.input(input_name) + ), "number of tensor for input [{}] is not match".format(input_name) + for output_name in src_op.desc.output_names(): + assert output_name in kwargs, "input [{}] is not given".format( + output_name) + assert len(kwargs[output_name]) == len( + src_op.desc.output(output_name) + ), "number of tensor for input [{}] is not match".format( + output_name) + + X_var = main_block.var(kwargs['X'][0]) + Weight_var = main_block.var(kwargs['Y'][0]) + Out_var = main_block.var(kwargs['Out'][0]) + + # TODO infer logic comm presentation + matmul_row_dim_mapping = op_dist_attr.get_input_dims_mapping( + Weight_var.name)[0] + assert matmul_row_dim_mapping >= 0, "row_parallel_matmul's row should be divided by a specific mesh axis, but got [{}]".format( + matmul_row_dim_mapping) + process_mesh_shape = op_dist_attr.get_process_mesh().topology + process_mesh_group = op_dist_attr.get_process_mesh().process_group + + parallel_axis = matmul_row_dim_mapping + group_ranks = _get_comm_group(process_mesh_group, process_mesh_shape, + parallel_axis, rank_id) + group = new_process_group(group_ranks) + + check_variable_and_dtype(X_var, 'x', ['float16', 'float32', 'float64'], + 'linear') + check_dtype(X_var.dtype, 'dtype', ['float16', 'float32', 'float64'], + 'linear') + attrs = {'trans_x': False, 'trans_y': False} + inputs = {'X': X_var, 'Y': Weight_var} + intermediate_var_0 = main_block.create_var( + shape=Out_var.shape, + dtype=Out_var.dtype, + type=Out_var.type, + lod_level=Out_var.lod_level, + persistable=False, + is_data=False, + need_check_feed=Out_var.desc.need_check_feed()) + # copy Out_var's dist_attr to intermediate_var_0's dist_attr + copy_distributed_attr_for_var(op_dist_attr, intermediate_var_0, Out_var) + + matmul_v2_op = main_block.append_op( + type='matmul_v2', + inputs=inputs, + outputs={'Out': intermediate_var_0}, + attrs=attrs) + + c_allreduce_sum_op = main_block.append_op( + type='c_allreduce_sum', + 
inputs={'X': intermediate_var_0}, + outputs={'Out': Out_var}, + attrs={ + 'ring_id': group.id, + 'use_calc_stream': True, + 'use_model_parallel': True + }) + + # copy serial op's dist_attr to dist op's dist_attr + copy_distributed_attr_for_dist_op(matmul_v2_op, main_block, + op_dist_attr) + copy_distributed_attr_for_dist_op(c_allreduce_sum_op, main_block, + op_dist_attr) + + # init param sync + if Weight_var.is_parameter: + _init_param_sync(Weight_var, dist_op_helper, startup_block, ctx, + rank_id) + + @staticmethod + def backward(ctx, *args, **kwargs): + _right_operand_parameter_matmul_backward(ctx, *args, **kwargs) # ReplicateParallel @@ -808,6 +985,10 @@ def update_dims_mapping(self, op_dist_attr): changed = True return changed + @staticmethod + def backward(ctx, *args, **kwargs): + _right_operand_parameter_matmul_backward(ctx, *args, **kwargs) + register_distributed_operator_impl("matmul_v2", DistributedMatmulV2Impl0("column_parallel")) diff --git a/python/paddle/distributed/auto_parallel/operators/dist_reshape.py b/python/paddle/distributed/auto_parallel/operators/dist_reshape.py index e7fbe9cfebad8..39e97850b8656 100644 --- a/python/paddle/distributed/auto_parallel/operators/dist_reshape.py +++ b/python/paddle/distributed/auto_parallel/operators/dist_reshape.py @@ -42,7 +42,7 @@ def __init__(self, name): super(DistributedReshapeImpl0, self).__init__() self._name = name self._forward_implemented = True - self._backward_implemented = False + self._backward_implemented = True def is_process_mesh_compatible(self, op_dist_attr): """ No restriction for now. """ @@ -97,82 +97,72 @@ def update_dims_mapping(self, op_dist_attr): return changed - def forward(self, serial_op): - def static_handle(dst_block, - src_op, - op_dist_attr, - input_name_mapping, - output_name_mapping, - rank_id=0): - assert len( - input_name_mapping - ) == 3, "Dist op of Reshape take 3 inputs variable but got {}".format( - input_name_mapping) - assert len( - output_name_mapping - ) == 2, "Dist op of Reshape take 2 inputs variable but got {}".format( - output_name_mapping) - assert len( - input_name_mapping['X'] - ) == 1, "Dist op of Reshape input X take 1 variable but got {}".format( - input_name_mapping['X']) - assert len( - input_name_mapping['ShapeTensor'] - ) <= 1, "Dist op of Reshape input ShapeTensor take 0 or 1 variable but got {}".format( - input_name_mapping['ShapeTensor']) - assert len( - input_name_mapping['Shape'] - ) <= 1, "Dist op of Reshape input Shape take 0 or 1 variable but got {}".format( - input_name_mapping['Shape']) - assert len( - output_name_mapping['Out'] - ) == 1, "Dist op of Reshape input Out take 1 variable but got {}".format( - input_name_mapping['Out']) - assert len( - output_name_mapping['XShape'] - ) == 1, "Dist op of Reshape input XShape take 1 variable but got {}".format( - input_name_mapping['XShape']) - - X_var = dst_block.var(input_name_mapping['X'][0]) - Out_var = dst_block.var(output_name_mapping['Out'][0]) - XShape_var = dst_block.var(output_name_mapping['XShape'][0]) - shape_list = src_op.desc.attr("shape") - ShapeTensor_var_list = [] - for name in input_name_mapping['ShapeTensor']: - ShapeTensor_var_list.append(name) - Shape_var_list = [] - for name in input_name_mapping['Shape']: - Shape_var_list.append(name) - - # got dist attribute info - dim_mapping = op_dist_attr.get_output_dims_mapping(Out_var.name) - process_mesh_shape = op_dist_attr.get_process_mesh().topology - - # modify target shape - for idx, axis in enumerate(dim_mapping): - if axis >= 0: - if len(shape_list) > idx: 
- shape_list[idx] = shape_list[idx] // process_mesh_shape[ - axis] - - # create op - new_op_desc = dst_block.desc.append_op() - new_op_desc.copy_from(src_op.desc) - new_op_desc.set_input('ShapeTensor', ShapeTensor_var_list) - new_op_desc.set_input('Shape', Shape_var_list) - new_op_desc.set_input('X', [X_var.name]) - new_op_desc.set_output('XShape', [XShape_var.name]) - new_op_desc.set_output('Out', [Out_var.name]) - new_op_desc._set_attr('shape', shape_list) - - dst_block._sync_with_cpp() - - if in_dygraph_mode(): - raise NotImplementedError( - "Dist op for [{}] with idx [{}] is NOT implemented yet.".format( - "matmul", 0)) - else: - return static_handle + @staticmethod + def forward(ctx, *args, **kwargs): + """ + kwargs: inputname_mapping & outputname_mapping + """ + + dist_op_helper = ctx.get_dist_op_helper() + main_block = dist_op_helper.get_dst_main_program().global_block() + src_op = dist_op_helper.get_cur_src_op() + rank_id = dist_op_helper.get_rank_id() + op_dist_attr = ctx.get_op_distributed_attr_for_program(src_op) + assert op_dist_attr is not None, "backward op [{}] don't have dist attribute !".format( + str(src_op)) + + # check validation of inputs / outputs + for input_name in src_op.desc.input_names(): + assert input_name in kwargs, "input [{}] is not given".format( + input_name) + assert len(kwargs[input_name]) == len( + src_op.desc.input(input_name) + ), "number of tensor for input [{}] is not match".format(input_name) + for output_name in src_op.desc.output_names(): + assert output_name in kwargs, "input [{}] is not given".format( + output_name) + assert len(kwargs[output_name]) == len( + src_op.desc.output(output_name) + ), "number of tensor for input [{}] is not match".format( + output_name) + + X_var = main_block.var(kwargs['X'][0]) + Out_var = main_block.var(kwargs['Out'][0]) + XShape_var = main_block.var(kwargs['XShape'][0]) + shape_list = src_op.desc.attr("shape") + ShapeTensor_var_list = [] + for name in kwargs['ShapeTensor']: + ShapeTensor_var_list.append(name) + Shape_var_list = [] + for name in kwargs['Shape']: + Shape_var_list.append(name) + + # got dist attribute info + dim_mapping = op_dist_attr.get_output_dims_mapping(Out_var.name) + process_mesh_shape = op_dist_attr.get_process_mesh().topology + + # modify target shape + for idx, axis in enumerate(dim_mapping): + if axis >= 0: + if len(shape_list) > idx: + shape_list[idx] = shape_list[idx] // process_mesh_shape[ + axis] + + # create op + new_op_desc = main_block.desc.append_op() + new_op_desc.copy_from(src_op.desc) + new_op_desc.set_input('ShapeTensor', ShapeTensor_var_list) + new_op_desc.set_input('Shape', Shape_var_list) + new_op_desc.set_input('X', [X_var.name]) + new_op_desc.set_output('XShape', [XShape_var.name]) + new_op_desc.set_output('Out', [Out_var.name]) + new_op_desc._set_attr('shape', shape_list) + + main_block._sync_with_cpp() + + @staticmethod + def backward(ctx, *args, **kwargs): + pass class DistributedReshapeImpl1(DistributedOperatorImpl): @@ -180,7 +170,7 @@ def __init__(self, name): super(DistributedReshapeImpl1, self).__init__() self._name = name self._forward_implemented = True - self._backward_implemented = False + self._backward_implemented = True def is_process_mesh_compatible(self, op_dist_attr): """ No restriction for now. 
""" @@ -235,82 +225,72 @@ def update_dims_mapping(self, op_dist_attr): return changed - def forward(self, serial_op): - def static_handle(dst_block, - src_op, - op_dist_attr, - input_name_mapping, - output_name_mapping, - rank_id=0): - assert len( - input_name_mapping - ) == 3, "Dist op of Reshape take 3 inputs variable but got {}".format( - input_name_mapping) - assert len( - output_name_mapping - ) == 2, "Dist op of Reshape take 2 inputs variable but got {}".format( - output_name_mapping) - assert len( - input_name_mapping['X'] - ) == 1, "Dist op of Reshape input X take 1 variable but got {}".format( - input_name_mapping['X']) - assert len( - input_name_mapping['ShapeTensor'] - ) <= 1, "Dist op of Reshape input ShapeTensor take 0 or 1 variable but got {}".format( - input_name_mapping['ShapeTensor']) - assert len( - input_name_mapping['Shape'] - ) <= 1, "Dist op of Reshape input Shape take 0 or 1 variable but got {}".format( - input_name_mapping['Shape']) - assert len( - output_name_mapping['Out'] - ) == 1, "Dist op of Reshape input Out take 1 variable but got {}".format( - input_name_mapping['Out']) - assert len( - output_name_mapping['XShape'] - ) == 1, "Dist op of Reshape input XShape take 1 variable but got {}".format( - input_name_mapping['XShape']) - - X_var = dst_block.var(input_name_mapping['X'][0]) - Out_var = dst_block.var(output_name_mapping['Out'][0]) - XShape_var = dst_block.var(output_name_mapping['XShape'][0]) - shape_list = src_op.desc.attr("shape") - ShapeTensor_var_list = [] - for name in input_name_mapping['ShapeTensor']: - ShapeTensor_var_list.append(name) - Shape_var_list = [] - for name in input_name_mapping['Shape']: - Shape_var_list.append(name) - - # got dist attribute info - dim_mapping = op_dist_attr.get_output_dims_mapping(Out_var.name) - process_mesh_shape = op_dist_attr.get_process_mesh().topology - - # modify target shape - for idx, axis in enumerate(dim_mapping): - if axis >= 0: - if len(shape_list) > idx: - shape_list[idx] = shape_list[idx] // process_mesh_shape[ - axis] - - # create op - new_op_desc = dst_block.desc.append_op() - new_op_desc.copy_from(src_op.desc) - new_op_desc.set_input('ShapeTensor', ShapeTensor_var_list) - new_op_desc.set_input('Shape', Shape_var_list) - new_op_desc.set_input('X', [X_var.name]) - new_op_desc.set_output('XShape', [XShape_var.name]) - new_op_desc.set_output('Out', [Out_var.name]) - new_op_desc._set_attr('shape', shape_list) - - dst_block._sync_with_cpp() - - if in_dygraph_mode(): - raise NotImplementedError( - "Dist op for [{}] with idx [{}] is NOT implemented yet.".format( - "matmul", 0)) - else: - return static_handle + @staticmethod + def forward(ctx, *args, **kwargs): + """ + kwargs: inputname_mapping & outputname_mapping + """ + + dist_op_helper = ctx.get_dist_op_helper() + main_block = dist_op_helper.get_dst_main_program().global_block() + src_op = dist_op_helper.get_cur_src_op() + rank_id = dist_op_helper.get_rank_id() + op_dist_attr = ctx.get_op_distributed_attr_for_program(src_op) + assert op_dist_attr is not None, "backward op [{}] don't have dist attribute !".format( + str(src_op)) + + # check validation of inputs / outputs + for input_name in src_op.desc.input_names(): + assert input_name in kwargs, "input [{}] is not given".format( + input_name) + assert len(kwargs[input_name]) == len( + src_op.desc.input(input_name) + ), "number of tensor for input [{}] is not match".format(input_name) + for output_name in src_op.desc.output_names(): + assert output_name in kwargs, "input [{}] is not given".format( + 
output_name) + assert len(kwargs[output_name]) == len( + src_op.desc.output(output_name) + ), "number of tensor for input [{}] is not match".format( + output_name) + + X_var = main_block.var(kwargs['X'][0]) + Out_var = main_block.var(kwargs['Out'][0]) + XShape_var = main_block.var(kwargs['XShape'][0]) + shape_list = src_op.desc.attr("shape") + ShapeTensor_var_list = [] + for name in kwargs['ShapeTensor']: + ShapeTensor_var_list.append(name) + Shape_var_list = [] + for name in kwargs['Shape']: + Shape_var_list.append(name) + + # got dist attribute info + dim_mapping = op_dist_attr.get_output_dims_mapping(Out_var.name) + process_mesh_shape = op_dist_attr.get_process_mesh().topology + + # modify target shape + for idx, axis in enumerate(dim_mapping): + if axis >= 0: + if len(shape_list) > idx: + shape_list[idx] = shape_list[idx] // process_mesh_shape[ + axis] + + # create op + new_op_desc = main_block.desc.append_op() + new_op_desc.copy_from(src_op.desc) + new_op_desc.set_input('ShapeTensor', ShapeTensor_var_list) + new_op_desc.set_input('Shape', Shape_var_list) + new_op_desc.set_input('X', [X_var.name]) + new_op_desc.set_output('XShape', [XShape_var.name]) + new_op_desc.set_output('Out', [Out_var.name]) + new_op_desc._set_attr('shape', shape_list) + + main_block._sync_with_cpp() + + @staticmethod + def backward(ctx, *args, **kwargs): + pass register_distributed_operator_impl("reshape2", diff --git a/python/paddle/distributed/auto_parallel/operators/dist_softmax.py b/python/paddle/distributed/auto_parallel/operators/dist_softmax.py index dc78bdee1fb14..56be75b3beaf2 100644 --- a/python/paddle/distributed/auto_parallel/operators/dist_softmax.py +++ b/python/paddle/distributed/auto_parallel/operators/dist_softmax.py @@ -37,6 +37,8 @@ class DistributedSoftmaxImpl(DistributedOperatorImpl): def __init__(self, name): super(DistributedSoftmaxImpl, self).__init__() self._name = name + self._forward_implemented = False + self._backward_implemented = True def is_process_mesh_compatible(self, op_dist_attr): """ No restriction for now. """ @@ -86,6 +88,10 @@ def update_dims_mapping(self, op_dist_attr): return changed + @staticmethod + def backward(ctx, *args, **kwargs): + pass + register_distributed_operator_impl( "softmax", DistributedSoftmaxImpl("replicate_last_axis")) diff --git a/python/paddle/distributed/auto_parallel/operators/dist_transpose.py b/python/paddle/distributed/auto_parallel/operators/dist_transpose.py index c2ca4d85fdf10..10b8bf2666f4b 100644 --- a/python/paddle/distributed/auto_parallel/operators/dist_transpose.py +++ b/python/paddle/distributed/auto_parallel/operators/dist_transpose.py @@ -37,6 +37,8 @@ class DistributedTranspose2Impl(DistributedOperatorImpl): def __init__(self, name): super(DistributedTranspose2Impl, self).__init__() self._name = name + self._forward_implemented = False + self._backward_implemented = True def is_process_mesh_compatible(self, op_dist_attr): """ No restriction for now. 
""" @@ -82,6 +84,10 @@ def update_dims_mapping(self, op_dist_attr): return changed + @staticmethod + def backward(ctx, *args, **kwargs): + pass + register_distributed_operator_impl( "transpose2", DistributedTranspose2Impl("same_mapping_transpose")) diff --git a/python/paddle/distributed/auto_parallel/parallelizer.py b/python/paddle/distributed/auto_parallel/parallelizer.py index 1437dbb2f9049..8f4a4866eb8db 100644 --- a/python/paddle/distributed/auto_parallel/parallelizer.py +++ b/python/paddle/distributed/auto_parallel/parallelizer.py @@ -94,10 +94,8 @@ def parallelize(self, # The last step: remove all distributed attributes to be compatiable # with inference. self._remove_distributed_attrs(partitioned_main_prog) - - complete_backward_annotation(partitioned_main_prog, self._dist_context) - make_data_unshard(partitioned_main_prog, partitioned_startup_prog) + reshard(partitioned_main_prog, partitioned_startup_prog, rank, self._dist_context) diff --git a/python/paddle/distributed/auto_parallel/partitioner.py b/python/paddle/distributed/auto_parallel/partitioner.py index b67f1e1ab97f2..c0a91f4b53a0d 100755 --- a/python/paddle/distributed/auto_parallel/partitioner.py +++ b/python/paddle/distributed/auto_parallel/partitioner.py @@ -23,15 +23,15 @@ from paddle.fluid.data_feeder import check_variable_and_dtype, check_dtype from paddle.fluid.backward import append_backward, _some_in_set_, _append_grad_suffix_ from paddle.distributed.auto_parallel.operators.common import get_distributed_operator -from paddle.distributed.auto_parallel.operators.common import find_best_compatible_distributed_operator_impl from paddle.fluid.clip import GradientClipBase, GradientClipByNorm, error_clip_callback, append_gradient_clip_ops, ClipGradByGlobalNorm from paddle.distributed.fleet.base.distributed_strategy import DistributedStrategy -from paddle.distributed.auto_parallel.context import DistributedContext +from paddle.distributed.auto_parallel.context import DistributedContext, DistOpHelper from paddle.distributed.fleet.meta_optimizers.common import is_loss_grad_op, is_backward_op, is_optimizer_op from paddle.distributed.fleet.meta_optimizers.common import OpRole, OP_ROLE_KEY, OP_ROLE_VAR_KEY from .process import new_process_group from .interface import _g_process_mesh_map -from .utils import _get_comm_group +from .attribute import OperatorDistributedAttribute +from paddle.distributed.auto_parallel.completion import complete_backward_annotation, complete_update_annotation __varname_not_in_block__ = ["lod_tensor_blocking_queue_0"] @@ -122,16 +122,6 @@ def __init__(self, dist_strategy, auto_parallel_context, rank_id=0): # should be set to False self._compatible_with_auto_backward = True - # data parallelism - self._enable_data_parallel = False - self._dp_degree = 0 - self._dp_group = None - - # tensor parallelism - self._enable_tensor_parallel = False - self._tp_degree = 0 - self._tp_group = None - def transpile_forward(self, serial_main_program, serial_startup_program): """ take serial forward programs with shard annotation, create a new distributed forward programs based on the serial ones. 
@@ -236,9 +226,6 @@ def transpile_forward_impl(self, main_program, startup_program): raise RuntimeError( "Not all vars or ops are annotated in main program !") - # determine parallelism mode - self._determine_parallel_mode(main_program) - # dist op & partition vars new_main_prog, new_startup_program = self._dist_var_op_forward_transpile( main_program, startup_program) @@ -270,11 +257,6 @@ def apply_backward_impl(self, self._sharding_backward_transpile(new_main_prog, new_startup_program) - # Data Parallel pass - if self._enable_data_parallel: - self._gradient_sync_transpile(dist_main_program, - dist_startup_program) - return params_grads def apply_optimize_impl(self, user_define_optimizer, params_grads, @@ -311,9 +293,78 @@ def _dist_var_op_forward_transpile(self, partitioned_main_prog = fluid.Program() partitioned_global_block = partitioned_main_prog.global_block() - serial_global_block = serial_main_program.global_block() + serial_main_block = serial_main_program.global_block() serial_ops = serial_main_program.global_block().ops + # transpile startup program + if serial_startup_program == None: + partitioned_startup_prog = None + else: + partitioned_startup_prog = fluid.Program() + # create parameter + partitioned_startup_global_block = partitioned_startup_prog.global_block( + ) + param2shape = {} + temp_varname_map = {} + for var in serial_startup_program.list_vars(): + if isinstance(var, Parameter): + # TODO if var not belong to this rank, should be filtered + serial_main_var = serial_main_block.var(var.name) + dist_attr = self._auto_parallel_context.get_tensor_distributed_attr_for_program( + serial_main_var) + target_shape = _get_dist_shape(serial_main_var, dist_attr) + new_name = var.name + self._dist_varname_suffix + temp_varname_map[var.name] = new_name + _partition_parameter(self._auto_parallel_context, + serial_main_var, + partitioned_startup_global_block, + new_name, target_shape) + param2shape[new_name] = target_shape + + # copy initializer + for op in serial_startup_program.global_block().ops: + # TODO if var not belong to this rank, should be filtered + output_vars = op.desc.output_arg_names() + assert len( + output_vars + ) == 1, "initializer should output only ONE variable, but got [{}]".format( + str(op.desc)) + assert temp_varname_map[output_vars[ + 0]] in param2shape, "try to initialize [{}] which is not a Parameter".format( + output_vars[0]) + new_op_desc = partitioned_startup_global_block.desc.append_op() + new_op_desc.copy_from(op.desc) + new_op_desc._rename_output(output_vars[0], + temp_varname_map[output_vars[0]]) + new_op_desc._set_attr( + "shape", param2shape[temp_varname_map[output_vars[0]]]) + partitioned_startup_global_block._sync_with_cpp() + + # set distribute atrribute + new_op = partitioned_startup_global_block.ops[-1] + assert new_op.type == new_op_desc.type() + assert new_op.desc == new_op_desc + output_var = partitioned_startup_global_block.var(output_vars[ + 0]) + output_var_attr = self._auto_parallel_context.get_tensor_distributed_attr_for_program( + output_var) + op_attr = OperatorDistributedAttribute( + new_op, self._auto_parallel_context) + op_attr.set_process_mesh(output_var_attr.get_process_mesh()) + op_attr.set_output_dims_mapping( + output_var.name, output_var_attr.get_dims_mapping()) + op_attr.set_input_dims_mapping( + output_var.name, output_var_attr.get_dims_mapping()) + self._auto_parallel_context.set_op_distributed_attr_for_program( + new_op, op_attr) + + # TODO move helper init to a comm place + dist_op_helper = 
self._auto_parallel_context.get_dist_op_helper() + dist_op_helper.set_dst_main_program(partitioned_main_prog) + dist_op_helper.set_dst_startup_program(partitioned_startup_prog) + dist_op_helper.set_varname_mapping(self._serial2dist_varname_mapping) + dist_op_helper.set_rank_id(self._rank_id) + # transpile main program for op in serial_ops: @@ -321,9 +372,9 @@ def _dist_var_op_forward_transpile(self, for serial_input_varname in op.desc.input_arg_names(): if serial_input_varname not in self._serial2dist_varname_mapping: new_varname = serial_input_varname + self._dist_varname_suffix - if serial_global_block.has_var(serial_input_varname): + if serial_main_block.has_var(serial_input_varname): _partition_var(self._auto_parallel_context, - serial_global_block, + serial_main_block, partitioned_global_block, serial_input_varname, new_varname) else: @@ -337,118 +388,27 @@ def _dist_var_op_forward_transpile(self, if serial_output_varname not in self._serial2dist_varname_mapping: new_varname = serial_output_varname + self._dist_varname_suffix _partition_var(self._auto_parallel_context, - serial_global_block, - partitioned_global_block, + serial_main_block, partitioned_global_block, serial_output_varname, new_varname) self._serial2dist_varname_mapping[ serial_output_varname] = new_varname # partition op - if _found_match_dist_op(self._auto_parallel_context, op): - # replace with corresponding dist op - _insert_dist_op(op, partitioned_global_block, - self._serial2dist_varname_mapping, - self._auto_parallel_context, self._rank_id) + kinputs, koutputs = dist_op_helper.prepare_forward_context(op) + dist_attr = self._auto_parallel_context.get_op_distributed_attr_for_program( + op) + if _is_dist_op_forward_implement(self._auto_parallel_context, op): + dist_ops = get_distributed_operator(op.type) + dist_op_impl = dist_ops.get_impl(dist_attr.get_impl_idx()) + dist_op_impl.forward(self._auto_parallel_context, **kinputs, + **koutputs) + else: # replicate op - _insert_src_op(op, partitioned_global_block, - self._serial2dist_varname_mapping) - - # transpile startup program - if serial_startup_program == None: - partitioned_startup_prog = None - else: - partitioned_startup_prog = fluid.Program() - # create parameter - partitioned_startup_global_block = partitioned_startup_prog.global_block( - ) - param2shape = {} - for var in partitioned_main_prog.list_vars(): - if isinstance(var, Parameter): - _partition_parameter(self._auto_parallel_context, var, - partitioned_startup_global_block, - var.name, var.shape) - param2shape[var.name] = var.shape - - # copy initializer - for op in serial_startup_program.global_block().ops: - output_vars = op.desc.output_arg_names() - assert len( - output_vars - ) == 1, "initializer should output only ONE variable, but got [{}]".format( - str(op.desc)) - assert self._serial2dist_varname_mapping[output_vars[ - 0]] in param2shape, "try to initialize [{}] which is not a Parameter".format( - output_vars[0]) - new_op_desc = partitioned_startup_global_block.desc.append_op() - new_op_desc.copy_from(op.desc) - new_op_desc._rename_output( - output_vars[0], - self._serial2dist_varname_mapping[output_vars[0]]) - new_op_desc._set_attr("shape", param2shape[ - self._serial2dist_varname_mapping[output_vars[0]]]) - partitioned_startup_global_block._sync_with_cpp() - - # MP broadcast not split parameter - # NOTE Theoretically, the MP param init broadcast should be handled by - # each dist op itself. 
but if we insert the broadcast op at that moment, the broadcast - # will before the initializer, which lead to a undertermined case. - if self._enable_tensor_parallel: - param_to_sync = [] - for param in partitioned_startup_prog.all_parameters(): - if not self._is_var_distributed(param): - param_to_sync.append(param) - # FIXME the ring id should be set by autoparallel.mapping module - # it should be determined by dp groups butfixed it here for hacking - partitioned_startup_global_block.append_op( - type='c_broadcast', - inputs={'X': param}, - outputs={'Out': param}, - attrs={ - 'ring_id': self._tp_group.id, - 'root': 0, - 'use_calc_stream': True, - OP_ROLE_KEY: OpRole.Forward - }) - partitioned_startup_global_block.append_op( - type='c_sync_comm_stream', - inputs={'X': param_to_sync}, - outputs={'Out': param_to_sync}, - attrs={ - 'ring_id': self._tp_group.id, - OP_ROLE_KEY: OpRole.Forward - }) - partitioned_startup_global_block._sync_with_cpp() - - # DP init param broadcast - if self._enable_data_parallel: - # parameters initialization synchronization - param_to_sync = [] - - for param in partitioned_startup_global_block.all_parameters(): - param_to_sync.append(param) - - # FIXME the ring id should be set by autoparallel.mapping module - # it should be determined by dp groups butfixed it here for hacking - partitioned_startup_global_block.append_op( - type='c_broadcast', - inputs={'X': param}, - outputs={'Out': param}, - attrs={ - 'ring_id': self._dp_group.id, - 'root': 0, - 'use_calc_stream': True, - OP_ROLE_KEY: OpRole.Forward - }) - partitioned_startup_global_block.append_op( - type='c_sync_comm_stream', - inputs={'X': param_to_sync}, - outputs={'Out': param_to_sync}, - attrs={ - 'ring_id': self._dp_group.id, - OP_ROLE_KEY: OpRole.Forward - }) - partitioned_startup_global_block._sync_with_cpp() + dist_ops = get_distributed_operator("default") + dist_op_impl = dist_ops.get_impl(0) + dist_op_impl.forward(self._auto_parallel_context, **kinputs, + **koutputs) return partitioned_main_prog, partitioned_startup_prog @@ -493,12 +453,65 @@ def _dist_var_op_backward_transpile(self, for param in no_grad_set ] - return _auto_backward( + dist_op_helper = self._auto_parallel_context.get_dist_op_helper() + params_and_grads = _auto_backward( dist_loss, dist_startup_program, parameter_list=parameter_list, no_grad_set=no_grad_set, - callbacks=callbacks) + callbacks=callbacks, + distop_context=dist_op_helper) + + # backward completion + complete_backward_annotation( + dist_main_program, dist_context=self._auto_parallel_context) + + # transpiler backward for dist op + # get backward ops + ops = dist_main_program.global_block().ops + first_backward_op_idx = -1 + forward_op_id2forward_op = {} + for idx in range(len(ops)): + if is_forward_op(ops[idx]): + forward_op_id2forward_op[ops[idx].desc.id()] = ops[idx] + + if int(ops[idx].attr('op_role')) == int(OpRole.Backward): + first_backward_op_idx = idx + break + assert first_backward_op_idx >= 0, "not found backward ops in program" + assert len(forward_op_id2forward_op + ) > 0, "not found forward ops in program" + + backward_ops = ops[first_backward_op_idx:] + for backward_op in backward_ops: + # if the backward op has a corresponding forward op + if backward_op.desc.id() in dist_op_helper.gradopidx2opidx: + forward_op_id = dist_op_helper.gradopidx2opidx[ + backward_op.desc.id()] + forward_op = forward_op_id2forward_op[forward_op_id] + # TODO backward attr should has _impl_idx + forward_op_dist_attr = 
self._auto_parallel_context.get_op_distributed_attr_for_program( + forward_op) + # TODO use the backward op itself to find the dist op + dist_ops = get_distributed_operator(forward_op.type) + kinputs, koutputs = dist_op_helper.prepare_backward_context( + backward_op) + + # TODO use backward op itself to determine impl idx + if _is_dist_op_backward_implement( + self._auto_parallel_context, forward_op): + dist_op_impl = dist_ops.get_impl( + forward_op_dist_attr.get_impl_idx()) + dist_op_impl.backward(self._auto_parallel_context, + **kinputs, **koutputs) + else: + # replicate op + dist_ops = get_distributed_operator("default") + dist_op_impl = dist_ops.get_impl(0) + dist_op_impl.backward(self._auto_parallel_context, + **kinputs, **koutputs) + + return params_and_grads # replace dist grad ops else: raise RuntimeError("transpile NOT implemented !") @@ -509,6 +522,10 @@ def _optimize_transpile(self, user_define_optimizer, params_grads, with program_guard(main_program, startup_program): optimize_ops = user_define_optimizer.apply_gradients(params_grads) + # update completion + complete_update_annotation( + main_program, dist_context=self._auto_parallel_context) + return optimize_ops def _is_valid_annotated_program(self, program): @@ -544,47 +561,6 @@ def _serial_varname2dist_var(self, serial_varname, dist_program): return dist_var - def _determine_parallel_mode(self, program): - """ - determine the parallelism that is enabled - NOTE a hard rule and should be updated in future - """ - - for param in program.all_parameters(): - if self._is_var_distributed(param): - self._enable_tensor_parallel = True - break - - for var in program.list_vars(): - var_dist_attr = self._auto_parallel_context.get_tensor_distributed_attr_for_program( - var) - if not var_dist_attr.is_parameter(): - mapping = var_dist_attr.get_dims_mapping() - mesh = var_dist_attr.get_process_mesh().topology - if mapping and mapping[0] >= 0 and mesh[mapping[0]] > 1: - self._enable_data_parallel = True - break - - # tensor parallelism - if self._enable_tensor_parallel: - model_parallel_axis, process_mesh = self._auto_parallel_context._get_model_parallel_info( - ) - group_ranks = _get_comm_group(process_mesh.process_group, - process_mesh.topology, - model_parallel_axis, self._rank_id) - self._tp_degree = len(group_ranks) - self._tp_group = new_process_group(group_ranks) - - # data parallelism - data_parallel_axis, process_mesh = self._auto_parallel_context._get_data_parallel_info( - ) - if self._enable_data_parallel: - group_ranks = _get_comm_group(process_mesh.process_group, - process_mesh.topology, - data_parallel_axis, self._rank_id) - self._dp_degree = len(group_ranks) - self._dp_group = new_process_group(group_ranks) - def _is_var_distributed(self, var): dist_attr = self._auto_parallel_context.get_tensor_distributed_attr_for_program( @@ -629,68 +605,6 @@ def _sharding_optimize_transpile(self, params_grads, dist_main_program, """ raise RuntimeError("sharding transpile is NOT implemented !") - def _gradient_sync_transpile(self, main_program, startup_program): - """ - append the gradient allreduce ops for all parameters' grad in case of Data Parallel - """ - - # scale loss by dp degree - main_global_block = main_program.global_block() - for idx, op in reversed(list(enumerate(main_global_block.ops))): - if is_loss_grad_op(op): - loss_grad_var = main_global_block.vars[op.output_arg_names[0]] - main_global_block._insert_op_without_sync( - idx + 1, - type='scale', - inputs={'X': loss_grad_var}, - outputs={'Out': loss_grad_var}, - attrs={ - 
'scale': 1.0 / self._dp_degree, - OP_ROLE_KEY: OpRole.Backward - }) - break - main_global_block._sync_with_cpp() - - # gradient synchronization - # NOTE naive gradient sync without overlapping - # so there is not need to sync between calc and comm - # collecting grad var - grad_to_sync = [] - for idx, op in reversed(list(enumerate(main_global_block.ops))): - if is_backward_op(op) and \ - OP_ROLE_VAR_KEY in op.attr_names: - op_role_var = op.all_attrs()[OP_ROLE_VAR_KEY] - if len(op_role_var) != 0: - assert len(op_role_var) % 2 == 0 - for i in range(0, len(op_role_var), 2): - param, reduced_grad = op_role_var[i], op_role_var[i + 1] - assert (reduced_grad not in grad_to_sync) - grad_to_sync.append(reduced_grad) - if is_optimizer_op(op): - first_optimize_op_idx = idx - - # insert allreduce - for grad in grad_to_sync: - # FIXME the ring id should be set by autoparallel.mapping module - # it should be determined by dp groups butfixed it here for hacking - main_global_block.append_op( - type='c_allreduce_sum', - inputs={'X': grad}, - outputs={'Out': grad}, - attrs={ - 'ring_id': self._dp_group.id, - 'root': 0, - 'use_calc_stream': True, - OP_ROLE_KEY: OpRole.Backward - }) - main_global_block.append_op( - type='c_sync_comm_stream', - inputs={'X': grad_to_sync}, - outputs={'Out': grad_to_sync}, - attrs={'ring_id': self._dp_group.id, - OP_ROLE_KEY: OpRole.Backward}) - main_global_block._sync_with_cpp() - def _get_no_grad_set_name(no_grad_set): no_grad_set_name = set() @@ -723,7 +637,7 @@ def _get_no_grad_set(loss, no_grad_set=None): return no_grad_set -def _found_match_dist_op(auto_paralle_context, op): +def _is_dist_op_forward_implement(auto_paralle_context, op): dist_attr = auto_paralle_context.get_op_distributed_attr_for_program(op) dist_ops = get_distributed_operator(op.type) @@ -731,11 +645,20 @@ def _found_match_dist_op(auto_paralle_context, op): dist_attr.get_impl_idx())._forward_implemented +def _is_dist_op_backward_implement(auto_paralle_context, op): + dist_attr = auto_paralle_context.get_op_distributed_attr_for_program(op) + dist_ops = get_distributed_operator(op.type) + + return dist_ops and dist_attr.get_impl_idx() >= 0 and dist_ops.get_impl( \ + dist_attr.get_impl_idx())._backward_implemented + + def _auto_backward(loss, startup_program=None, parameter_list=None, no_grad_set=None, - callbacks=None): + callbacks=None, + distop_context=None): """ modification is inplaced """ @@ -753,9 +676,14 @@ def _auto_backward(loss, loss.shape) program = loss.block.program + with program_guard(program, startup_program): - params_grads = append_backward(loss, parameter_list, act_no_grad_set, - callbacks) + params_grads = append_backward( + loss, + parameter_list, + act_no_grad_set, + callbacks, + distop_context=distop_context) return params_grads @@ -822,6 +750,7 @@ def _partition_parameter(auto_paralle_context, src_var, dst_block, dst_varname, # param.desc.set_distributed_attr_uid(distributed_attr_uid) dist_attr = copy.deepcopy( auto_paralle_context.get_tensor_distributed_attr_for_program(src_var)) + assert dist_attr is not None dist_attr._owner_tensor = param dist_attr._owner_context = auto_paralle_context.get_tensor_distributed_attr_for_program( src_var)._owner_context @@ -848,6 +777,7 @@ def _partition_intermediate_var(auto_paralle_context, src_var, dst_block, # var.desc.set_distributed_attr_uid(distributed_attr_uid) dist_attr = copy.deepcopy( auto_paralle_context.get_tensor_distributed_attr_for_program(src_var)) + assert dist_attr is not None dist_attr._owner_tensor = var dist_attr._owner_context 
= auto_paralle_context.get_tensor_distributed_attr_for_program( src_var)._owner_context @@ -923,3 +853,11 @@ def _insert_dist_op(src_op, dst_block, varname_mapping, auto_paralle_context, input_mapping, output_mapping, rank_id=rank_id) + + +def is_forward_op(op): + role1 = int(core.op_proto_and_checker_maker.OpRole.Forward) | int( + core.op_proto_and_checker_maker.OpRole.Loss) + role2 = int(core.op_proto_and_checker_maker.OpRole.Forward) + op_role = int(op.attr('op_role')) + return op_role == role2 or op_role == role1 diff --git a/python/paddle/distributed/auto_parallel/utils.py b/python/paddle/distributed/auto_parallel/utils.py index a81ff69918905..813bd481d9286 100755 --- a/python/paddle/distributed/auto_parallel/utils.py +++ b/python/paddle/distributed/auto_parallel/utils.py @@ -15,6 +15,7 @@ import threading import paddle.fluid.core as core import numpy as np +from .interface import _g_process_mesh_map def is_valid_list_index(list, index): @@ -171,7 +172,9 @@ def _get_comm_group(processes, shape, axis, rank): """ # NOTE _linear_idx2coordinate assume processes mesh start with 0 and continuous - # tricks to support processes mesh when it is not start with 0 or continuous + # tricks to support processes mesh when it is not start with 0 or continuous + assert rank in processes, "rank [{}] is NOT in processes group {}".format( + rank, processes) rank_relatvie = processes.index(rank) coordinate = _linear_idx2coordinate(shape, rank_relatvie) coordinates_in_group = [coordinate[:] for i in range(shape[axis])] @@ -189,6 +192,25 @@ def _get_comm_group(processes, shape, axis, rank): return sorted(ranks_in_group) +def _get_idx_in_axis(processes, shape, axis, rank): + """ + Given a rank and the processes mesh the rank belongs to, + compute the index of the rank in given axis. + + Example: 27 processes managed in a 3-Dimensinal mesh with shape of [3, 3, 3]. + the index of rank 22 are: + in axis 0: 1 + in axis 1: 1 + in axis 2: 2 + """ + + # NOTE _linear_idx2coordinate assume processes mesh start with 0 and continuous + # tricks to support processes mesh when it is not start with 0 or continuous + rank_relatvie = processes.index(rank) + coordinate = _linear_idx2coordinate(shape, rank_relatvie) + return coordinate[axis] + + def _coordinate2linear_idx(mesh_shape, coordinate): """ convert a coordinate in multidimensional mesh space into a scala idx in linear space. @@ -279,6 +301,27 @@ def _linear_idx2coordinate(mesh_shape, linear_idx): return coordinate +def _get_corresponding_rank(target_mesh, rank): + + # TODO(JZ-LIANG) a hack method to support varying mesh in Pipeline parallelism case. + # we assume that all mesh are evenly divide from a parent mesh and should have same size. + # to revise this in future. 
+ + coordinate = None + for key, mesh in _g_process_mesh_map.items(): + if key == 0: + continue + if rank in mesh.process_group and mesh.topology == target_mesh.topology: + coordinate = _linear_idx2coordinate(mesh.topology, + mesh.process_group.index(rank)) + break + + assert coordinate is not None, "could NOT found rank [{}] in any registered mesh".format( + rank) + return target_mesh.process_group[_coordinate2linear_idx(mesh.topology, + coordinate)] + + def _get_unshard_dist_shape(var, dist_attr): var_shape = var.shape mapping = dist_attr.get_dims_mapping() diff --git a/python/paddle/fluid/backward.py b/python/paddle/fluid/backward.py index d62f7b5941126..9ea407c760f07 100755 --- a/python/paddle/fluid/backward.py +++ b/python/paddle/fluid/backward.py @@ -1051,7 +1051,8 @@ def _append_backward_ops_(block, grad_to_var, callbacks=None, input_grad_names_set=None, - op_path_dict=None): + op_path_dict=None, + distop_context=None): """ Create all grad ops, and insert them into given block @@ -1108,6 +1109,10 @@ def _append_backward_ops_(block, # Getting op's corresponding grad_op grad_op_desc, op_grad_to_var = core.get_grad_op_desc( op.desc, cpt.to_text(no_grad_dict[block.idx]), grad_sub_block_list) + if distop_context is not None: + for op_desc in grad_op_desc: + assert op_desc.id() not in distop_context.gradopidx2opidx + distop_context.gradopidx2opidx[op_desc.id()] = op.desc.id() # Set device for grad_op according to forward Op device_attr_name = core.op_proto_and_checker_maker.kOpDeviceAttrName() @@ -1402,7 +1407,8 @@ def append_backward(loss, parameter_list=None, no_grad_set=None, callbacks=None, - checkpoints=None): + checkpoints=None, + distop_context=None): """ :api_attr: Static Graph @@ -1617,7 +1623,8 @@ def append_backward(loss, grad_to_var, callbacks, input_grad_names_set=input_grad_names_set, - op_path_dict=op_path_dict) + op_path_dict=op_path_dict, + distop_context=distop_context, ) grad_info_map = dict() diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 90f59758a2faf..745e711852272 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -32,6 +32,7 @@ list(APPEND DIST_TEST_OPS test_parallel_dygraph_dataparallel) list(APPEND DIST_TEST_OPS test_parallel_dygraph_pipeline_parallel) list(APPEND DIST_TEST_OPS test_parallel_dygraph_tensor_parallel) list(APPEND DIST_TEST_OPS test_parallel_dygraph_sharding_parallel) +list(APPEND DIST_TEST_OPS test_auto_parallel_parallelizer) list(APPEND DIST_TEST_OPS test_parallel_dygraph_mp_layers) list(APPEND DIST_TEST_OPS test_hybrid_parallel_inference_helper) list(APPEND DIST_TEST_OPS test_parallel_class_center_sample) @@ -221,6 +222,7 @@ if ((NOT WITH_GPU) AND (NOT WITH_ROCM)) list(REMOVE_ITEM TEST_OPS test_parallel_dygraph_pipeline_parallel) list(REMOVE_ITEM TEST_OPS test_parallel_dygraph_tensor_parallel) list(REMOVE_ITEM TEST_OPS test_parallel_dygraph_sharding_parallel) + list(REMOVE_ITEM TEST_OPS test_auto_parallel_parallelizer) list(REMOVE_ITEM TEST_OPS test_parallel_dygraph_mp_layers) LIST(REMOVE_ITEM TEST_OPS test_imperative_auto_mixed_precision) LIST(REMOVE_ITEM TEST_OPS test_mixed_precision) @@ -1002,6 +1004,7 @@ if(WITH_DISTRIBUTE AND WITH_GPU AND WITH_NCCL) set_tests_properties(test_parallel_dygraph_pipeline_parallel PROPERTIES TIMEOUT 120) set_tests_properties(test_parallel_dygraph_tensor_parallel PROPERTIES TIMEOUT 200) set_tests_properties(test_parallel_dygraph_sharding_parallel PROPERTIES TIMEOUT 
120) + set_tests_properties(test_auto_parallel_parallelizer PROPERTIES TIMEOUT 120) set_tests_properties(test_parallel_dygraph_mp_layers PROPERTIES TIMEOUT 120) set_tests_properties(test_hybrid_parallel_inference_helper PROPERTIES TIMEOUT 120) set_tests_properties(test_parallel_class_center_sample PROPERTIES TIMEOUT 120) diff --git a/python/paddle/fluid/tests/unittests/auto_parallel_parallelizer.py b/python/paddle/fluid/tests/unittests/auto_parallel_parallelizer.py new file mode 100755 index 0000000000000..89880f8c2f49d --- /dev/null +++ b/python/paddle/fluid/tests/unittests/auto_parallel_parallelizer.py @@ -0,0 +1,140 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest + +import paddle +import paddle.nn as nn +import paddle.static as static +import paddle.nn.functional as F +import paddle.utils as utils +from paddle.fluid import layers +from paddle.distributed import fleet +import paddle.distributed.auto_parallel as auto +from paddle.distributed.auto_parallel.utils import print_program_with_distributed_attr +import paddle.fluid.core as core + +paddle.enable_static() +_global_parallel_strategy = None +_global_process_mesh = None +ROOT_MESH = auto.ProcessMesh([0, 1]) + + +class MLPLayer(nn.Layer): + def __init__(self, + hidden_size=1024, + intermediate_size=4 * 1024, + dropout_ratio=0.1, + initializer_range=0.02): + super(MLPLayer, self).__init__() + d_model = hidden_size + dim_feedforward = intermediate_size + weight_attr = paddle.ParamAttr(initializer=nn.initializer.Normal( + mean=0.0, std=initializer_range)) + bias_attr = None + + self.linear0 = nn.Linear( + d_model, dim_feedforward, weight_attr, bias_attr=bias_attr) + self.linear1 = nn.Linear( + dim_feedforward, d_model, weight_attr, bias_attr=bias_attr) + self.linear2 = nn.Linear(d_model, 1, weight_attr, bias_attr=bias_attr) + self.norm = nn.LayerNorm(d_model, epsilon=1e-5) + self.dropout = nn.Dropout(dropout_ratio, mode="upscale_in_train") + + def forward(self, input): + out = self.norm(input) + out = self.linear0(out) + out = F.gelu(out, approximate=True) + out = self.linear1(out) + out = self.dropout(out) + out = self.linear2(out) + + return out + + +def mlp_pretrain_forward(train_program, start_program): + with static.program_guard(train_program, + start_program), utils.unique_name.guard(): + batch_size = 4 + hidden_size = 1024 + sequence_len = 512 + input = static.data( + name="input", + shape=[batch_size, sequence_len, hidden_size], + dtype='float32') + label = static.data( + name="label", shape=[batch_size, sequence_len, 1], dtype='float32') + + auto.shard_tensor(input, _global_process_mesh, dim_mapping=[-1, -1, -1]) + auto.set_pipeline_stage(1) + + mlp = MLPLayer( + hidden_size=hidden_size, + intermediate_size=4 * hidden_size, + dropout_ratio=0.1, + initializer_range=0.02) + + predict = mlp(input) + + cost = layers.cross_entropy(input=predict, label=label) + avg_cost = layers.mean(x=cost) + + return avg_cost, 
train_program, start_program + + +class TestMLPAutoParallelizer(unittest.TestCase): + def test_mlp_serial(self): + + global _global_process_mesh + _global_process_mesh = auto.ProcessMesh(mesh=[0, 1], parent=ROOT_MESH) + + dist_strategy = fleet.DistributedStrategy() + dist_strategy.amp = False + dist_strategy.pipeline = False + dist_strategy.recompute = False + + # init parallel optimizer + dist_strategy.semi_auto = True + + fleet.init(is_collective=True, strategy=dist_strategy) + + train_program = static.Program() + start_program = static.Program() + loss, train_program, start_program = mlp_pretrain_forward(train_program, + start_program) + + optimizer = paddle.fluid.optimizer.AdamOptimizer( + learning_rate=0.00001, + beta1=0.9, + beta2=0.999, + epsilon=1e-08, + grad_clip=None) + + optimizer = fleet.distributed_optimizer(optimizer) + _, _, distributed_startup_program, distributed_main_program = optimizer.minimize( + loss, start_program) + suffix = core.kAutoParallelSuffix() + for block in distributed_main_program.blocks: + for op in block.ops: + for attr_name in op.attr_names: + self.assertTrue(suffix not in attr_name) + # print_program_with_distributed_attr(distributed_main_program) + self.assertIsNotNone(distributed_startup_program) + self.assertIsNotNone(distributed_main_program) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_parallelizer.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_parallelizer.py index a92e1e2f338b1..7147716c74ccd 100755 --- a/python/paddle/fluid/tests/unittests/test_auto_parallel_parallelizer.py +++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_parallelizer.py @@ -15,130 +15,16 @@ from __future__ import print_function import unittest +import paddle.fluid as fluid -# The following statements are used to satisfy fleet initialization -import os -if os.getenv("CUDA_VISIBLE_DEVICES", None) is None: - os.environ["CUDA_VISIBLE_DEVICES"] = '0' +from test_parallel_dygraph_dataparallel import TestMultipleGpus -import paddle -import paddle.nn as nn -import paddle.static as static -import paddle.nn.functional as F -import paddle.utils as utils -from paddle.fluid import layers -from paddle.distributed import fleet -import paddle.distributed.auto_parallel as auto -from paddle.distributed.auto_parallel.utils import print_program_with_distributed_attr -import paddle.fluid.core as core -paddle.enable_static() -_global_parallel_strategy = None -_global_process_mesh = None -ROOT_MESH = auto.ProcessMesh([0, 1]) +class TestParallelizer(TestMultipleGpus): - -class MLPLayer(nn.Layer): - def __init__(self, - hidden_size=1024, - intermediate_size=4 * 1024, - dropout_ratio=0.1, - initializer_range=0.02): - super(MLPLayer, self).__init__() - d_model = hidden_size - dim_feedforward = intermediate_size - weight_attr = paddle.ParamAttr(initializer=nn.initializer.Normal( - mean=0.0, std=initializer_range)) - bias_attr = None - - self.linear0 = nn.Linear( - d_model, dim_feedforward, weight_attr, bias_attr=bias_attr) - self.linear1 = nn.Linear( - dim_feedforward, d_model, weight_attr, bias_attr=bias_attr) - self.linear2 = nn.Linear(d_model, 1, weight_attr, bias_attr=bias_attr) - self.norm = nn.LayerNorm(d_model, epsilon=1e-5) - self.dropout = nn.Dropout(dropout_ratio, mode="upscale_in_train") - - def forward(self, input): - out = self.norm(input) - out = self.linear0(out) - out = F.gelu(out, approximate=True) - out = self.linear1(out) - out = self.dropout(out) - out = self.linear2(out) - - return out - - 
-def mlp_pretrain_forward(train_program, start_program): - with static.program_guard(train_program, - start_program), utils.unique_name.guard(): - batch_size = 4 - hidden_size = 1024 - sequence_len = 512 - input = static.data( - name="input", - shape=[batch_size, sequence_len, hidden_size], - dtype='float32') - label = static.data( - name="label", shape=[batch_size, sequence_len, 1], dtype='float32') - - auto.shard_tensor(input, _global_process_mesh, dim_mapping=[-1, -1, -1]) - auto.set_pipeline_stage(1) - - mlp = MLPLayer( - hidden_size=hidden_size, - intermediate_size=4 * hidden_size, - dropout_ratio=0.1, - initializer_range=0.02) - - predict = mlp(input) - - cost = layers.cross_entropy(input=predict, label=label) - avg_cost = layers.mean(x=cost) - - return avg_cost, train_program, start_program - - -class TestMLPAutoParallelizer(unittest.TestCase): - def test_mlp_serial(self): - - global _global_process_mesh - _global_process_mesh = auto.ProcessMesh(mesh=[0, 1], parent=ROOT_MESH) - - dist_strategy = fleet.DistributedStrategy() - dist_strategy.amp = False - dist_strategy.pipeline = False - dist_strategy.recompute = False - - # init parallel optimizer - dist_strategy.semi_auto = True - - fleet.init(is_collective=True, strategy=dist_strategy) - - train_program = static.Program() - start_program = static.Program() - loss, train_program, start_program = mlp_pretrain_forward(train_program, - start_program) - - optimizer = paddle.fluid.optimizer.AdamOptimizer( - learning_rate=0.00001, - beta1=0.9, - beta2=0.999, - epsilon=1e-08, - grad_clip=None) - - optimizer = fleet.distributed_optimizer(optimizer) - _, _, distributed_startup_program, distributed_main_program = optimizer.minimize( - loss, start_program) - suffix = core.kAutoParallelSuffix() - for block in distributed_main_program.blocks: - for op in block.ops: - for attr_name in op.attr_names: - self.assertTrue(suffix not in attr_name) - # print_program_with_distributed_attr(distributed_main_program) - self.assertIsNotNone(distributed_startup_program) - self.assertIsNotNone(distributed_main_program) + # check sharding logic as well as the accuracy with single mode + def test_parallelizer_logic(self): + self.run_mnist_2gpu('auto_parallel_parallelizer.py') if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_partitioner.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_partitioner.py index 29ba863c96226..44a525244015b 100755 --- a/python/paddle/fluid/tests/unittests/test_auto_parallel_partitioner.py +++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_partitioner.py @@ -92,9 +92,9 @@ def check_tensor_split(prog1, varnames1, prog2, varnames2, axis, nsplit): def initialization_check(mode, dist_context, dist_startup_prog, - serial_startup_prog, var_need_broadcast): + serial_startup_prog, var_need_broadcast, process_mesh, + mp_parallel_axis, dp_parallel_axis): if 'mp' in mode: - mp_parallel_axis, process_mesh = dist_context._get_model_parallel_info() group_ranks = _get_comm_group(process_mesh.process_group, process_mesh.topology, mp_parallel_axis, 3) @@ -110,7 +110,6 @@ def initialization_check(mode, dist_context, dist_startup_prog, return False if 'dp' in mode: - dp_parallel_axis, process_mesh = dist_context._get_data_parallel_info() group_ranks = _get_comm_group(process_mesh.process_group, process_mesh.topology, dp_parallel_axis, 3) @@ -359,9 +358,15 @@ def test_mlp_dp(self): # parameter initialization var_need_broadcast = [] self.assertTrue( - 
initialization_check(_global_parallel_strategy, dist_context, - dist_startup_prog, serial_startup_prog, - var_need_broadcast)) + initialization_check( + _global_parallel_strategy, + dist_context, + dist_startup_prog, + serial_startup_prog, + var_need_broadcast, + _global_process_mesh, + mp_parallel_axis=None, + dp_parallel_axis=0)) def test_mlp_mp(self): global _global_parallel_strategy @@ -406,9 +411,15 @@ def test_mlp_mp(self): var_need_broadcast = sorted( ['layer_norm_0.b_0', 'layer_norm_0.w_0', 'linear_1.b_0']) self.assertTrue( - initialization_check(_global_parallel_strategy, dist_context, - dist_startup_prog, serial_startup_prog, - var_need_broadcast)) + initialization_check( + _global_parallel_strategy, + dist_context, + dist_startup_prog, + serial_startup_prog, + var_need_broadcast, + _global_process_mesh, + mp_parallel_axis=0, + dp_parallel_axis=None)) # check var and op all have dist_attr in dist_main_program self.assertTrue( @@ -464,9 +475,15 @@ def test_mlp_dp_mp(self): var_need_broadcast = sorted( ['layer_norm_0.b_0', 'layer_norm_0.w_0', 'linear_1.b_0']) self.assertTrue( - initialization_check(_global_parallel_strategy, dist_context, - dist_startup_prog, serial_startup_prog, - var_need_broadcast)) + initialization_check( + _global_parallel_strategy, + dist_context, + dist_startup_prog, + serial_startup_prog, + var_need_broadcast, + _global_process_mesh, + mp_parallel_axis=1, + dp_parallel_axis=0)) # check var and op all have dist_attr in dist_main_program self.assertTrue( @@ -635,9 +652,15 @@ def test_attn_dp(self): # parameter initialization var_need_broadcast = [] self.assertTrue( - initialization_check(_global_parallel_strategy, dist_context, - dist_startup_prog, serial_startup_prog, - var_need_broadcast)) + initialization_check( + _global_parallel_strategy, + dist_context, + dist_startup_prog, + serial_startup_prog, + var_need_broadcast, + _global_process_mesh, + mp_parallel_axis=None, + dp_parallel_axis=0)) def test_attn_mp(self): global _global_parallel_strategy @@ -686,9 +709,15 @@ def test_attn_mp(self): # parameter initialization var_need_broadcast = ['linear_3.b_0'] self.assertTrue( - initialization_check(_global_parallel_strategy, dist_context, - dist_startup_prog, serial_startup_prog, - var_need_broadcast)) + initialization_check( + _global_parallel_strategy, + dist_context, + dist_startup_prog, + serial_startup_prog, + var_need_broadcast, + _global_process_mesh, + mp_parallel_axis=0, + dp_parallel_axis=None)) # check var and op all have dist_attr in dist_main_program self.assertTrue( @@ -748,9 +777,15 @@ def test_attn_dp_mp(self): # parameter initialization var_need_broadcast = ['linear_3.b_0'] self.assertTrue( - initialization_check(_global_parallel_strategy, dist_context, - dist_startup_prog, serial_startup_prog, - var_need_broadcast)) + initialization_check( + _global_parallel_strategy, + dist_context, + dist_startup_prog, + serial_startup_prog, + var_need_broadcast, + _global_process_mesh, + mp_parallel_axis=1, + dp_parallel_axis=0)) # check var and op all have dist_attr in dist_main_program self.assertTrue( @@ -1043,9 +1078,15 @@ def test_decoder_dp_mp(self): 'layer_norm_0.w_0', 'linear_5.b_0' ]) self.assertTrue( - initialization_check(_global_parallel_strategy, dist_context, - dist_startup_prog, serial_startup_prog, - var_need_broadcast)) + initialization_check( + _global_parallel_strategy, + dist_context, + dist_startup_prog, + serial_startup_prog, + var_need_broadcast, + _global_process_mesh, + mp_parallel_axis=1, + dp_parallel_axis=0)) # check var and op 
all have dist_attr in dist_main_program self.assertTrue( @@ -1117,7 +1158,16 @@ def test_decoder_noparallel(self): 'fill_constant', 'gaussian_random', 'fill_constant', 'gaussian_random', 'fill_constant', 'gaussian_random', 'fill_constant', 'gaussian_random', 'fill_constant', - 'gaussian_random', 'fill_constant', 'fill_constant', 'fill_constant' + 'gaussian_random', 'fill_constant', 'fill_constant', + 'fill_constant', 'c_broadcast', 'c_broadcast', 'c_broadcast', + 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast', + 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast', + 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast', + 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast', + 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast', + 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast', + 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast', + 'c_broadcast' ] self.assertTrue(dist_ops == ref_ops) diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_partitioner_gpt.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_partitioner_gpt.py index 16cbad3ef6f8b..11b3338bc675c 100755 --- a/python/paddle/fluid/tests/unittests/test_auto_parallel_partitioner_gpt.py +++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_partitioner_gpt.py @@ -521,7 +521,7 @@ class GPTModel(nn.Layer): def __init__(self, vocab_size, hidden_size=768, - num_hidden_layers=12, + num_hidden_layers=4, num_attention_heads=12, intermediate_size=3072, hidden_act="gelu", @@ -787,6 +787,14 @@ def test_gpt_dp_mp(self): dist_params_grads = partitioner.apply_backward( loss, complete_train_program, start_program, auto_parallel_main_prog, auto_parallel_startup_prog) + + with open("./test_auto_parallel_partitioner_serial_main_new.txt", + "w") as fw: + fw.write(str(train_program)) + with open("./test_auto_parallel_partitioner_serial_startup_new.txt", + "w") as fw: + fw.write(str(start_program)) + optimizer = paddle.fluid.optimizer.AdamOptimizer( learning_rate=0.00001, beta1=0.9, @@ -796,7 +804,17 @@ def test_gpt_dp_mp(self): opt_ops = partitioner.apply_optimize(optimizer, dist_params_grads, auto_parallel_main_prog, auto_parallel_startup_prog) - + from paddle.distributed.auto_parallel.context import set_default_distributed_context + set_default_distributed_context(dist_context) + with open("./test_auto_parallel_partitioner_main_new.txt1", "w") as fw: + fw.write(str(auto_parallel_main_prog)) + with open("./test_auto_parallel_partitioner_startup_new.txt1", + "w") as fw: + fw.write(str(auto_parallel_startup_prog)) + # with open("./test_auto_parallel_partitioner_main_completed.txt", "w") as fw: + # from paddle.distributed.auto_parallel.completion import complete_backward_annotation + # complete_backward_annotation(auto_parallel_main_prog) + # fw.write(str(auto_parallel_main_prog)) nrank = 4 # col parallel weights = [ @@ -826,16 +844,20 @@ def test_gpt_dp_mp(self): 'layer_norm_6.tmp_2', 'layer_norm_7.tmp_2', 'layer_norm_7.tmp_2', 'layer_norm_7.tmp_2', 'layer_norm_8.tmp_2' ] - mp_parallel_axis, process_mesh = dist_context._get_model_parallel_info() + process_mesh = _global_process_mesh + mp_parallel_axis = 1 + dp_parallel_axis = 0 + group_ranks = _get_comm_group(process_mesh.process_group, process_mesh.topology, mp_parallel_axis, 3) mp_ring_id = new_process_group(group_ranks).id - dp_parallel_axis, process_mesh = dist_context._get_data_parallel_info() + group_ranks = _get_comm_group(process_mesh.process_group, process_mesh.topology, dp_parallel_axis, 3) dp_ring_id = 
new_process_group(group_ranks).id + tensor_parallel_allreduce_vars = sorted([ op.desc.output_arg_names()[0].split("@")[0] for op in auto_parallel_main_prog.global_block().ops diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard.py index da82e56d4a151..fe9b965ed8733 100644 --- a/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard.py +++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard.py @@ -25,7 +25,6 @@ from paddle.distributed.auto_parallel.context import DistributedContext from paddle.distributed import fleet from paddle.distributed.auto_parallel.partitioner import Partitioner -from paddle.distributed.auto_parallel.completion import complete_backward_annotation from paddle.distributed.auto_parallel.reshard import reshard from paddle.distributed.auto_parallel.process import PROCESS_GROUP_MAP @@ -211,7 +210,8 @@ def check_initialization_for_dp(dist_startup_prog): if op.type == "c_broadcast": broadcast_varnames.append(op.output_arg_names[0]) - return params == need_check_params == broadcast_varnames + return sorted(params) == sorted(need_check_params) == sorted( + broadcast_varnames) class TestMLPReshard(unittest.TestCase): @@ -225,7 +225,6 @@ def test_complete_backward_annotation(self): rank_id = 0 dist_main_prog, dist_startup_prog = get_dist_prog( train_program, startup_program, dist_context, 0) - complete_backward_annotation(dist_main_prog, dist_context) op_need_check = None for op in dist_main_prog.global_block().ops: @@ -254,7 +253,6 @@ def test_mlp_pp(self): rank_id = 1 dist_main_prog, dist_startup_prog = get_dist_prog( train_program, startup_program, dist_context, rank_id) - complete_backward_annotation(dist_main_prog, dist_context) for key in list(PROCESS_GROUP_MAP.keys()): del PROCESS_GROUP_MAP[key] reshard(dist_main_prog, dist_startup_prog, rank_id, dist_context) @@ -277,7 +275,6 @@ def test_mlp_dp(self): rank_id = 0 dist_main_prog, dist_startup_prog = get_dist_prog( train_program, startup_program, dist_context, rank_id) - complete_backward_annotation(dist_main_prog, dist_context) reshard(dist_main_prog, dist_startup_prog, rank_id, dist_context) # send and recv should not exist in dp scene. 
self.assertFalse(check_send_recv_result(dist_main_prog, rank_id)) diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_dpmppp.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_dpmppp.py index 1e134eebfd23b..babc622393c40 100644 --- a/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_dpmppp.py +++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_dpmppp.py @@ -25,7 +25,6 @@ from paddle.distributed.auto_parallel.context import DistributedContext from paddle.distributed import fleet from paddle.distributed.auto_parallel.partitioner import Partitioner -from paddle.distributed.auto_parallel.completion import complete_backward_annotation from paddle.distributed.auto_parallel.reshard import reshard paddle.enable_static() @@ -158,7 +157,6 @@ def test_mlp_dpmppp(self): dist_main_prog, dist_startup_prog = get_dist_prog( train_program, startup_program, dist_context, rank_id) print(dist_main_prog) - complete_backward_annotation(dist_main_prog, dist_context) reshard(dist_main_prog, dist_startup_prog, rank_id, dist_context) print(dist_main_prog) print(dist_startup_prog) diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_mppp.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_mppp.py index 5a10a21834570..96a8b2a8d7cdb 100644 --- a/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_mppp.py +++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_mppp.py @@ -25,7 +25,6 @@ from paddle.distributed.auto_parallel.context import DistributedContext from paddle.distributed import fleet from paddle.distributed.auto_parallel.partitioner import Partitioner -from paddle.distributed.auto_parallel.completion import complete_backward_annotation from paddle.distributed.auto_parallel.reshard import reshard paddle.enable_static() @@ -187,7 +186,6 @@ def test_mlp_mppp(self): rank_id = 2 dist_main_prog, dist_startup_prog = get_dist_prog( train_program, startup_program, dist_context, rank_id) - complete_backward_annotation(dist_main_prog, dist_context) reshard(dist_main_prog, dist_startup_prog, rank_id, dist_context) # check send and recv result From c285c71916035e433b45e7642c17d31092b45199 Mon Sep 17 00:00:00 2001 From: Jiabin Yang Date: Wed, 20 Oct 2021 10:25:40 +0800 Subject: [PATCH 038/116] [FIX] Extend time for test_activation_nn_grad to avoid its timeout issue (#36527) * native commit for triple grad of sigmod * Updated unittests files * init functional jacobian api * Updated trible_test func * Updated gradient_checker & test_script * finish test with dtype float32 * add float64 test case * polish code * use atol=1e-5 with dtype float64 * fix for ci * set timeout for test_jacobian * fix dygraph grad to support high differential * polish API docstring * Updated gradient checker and some related files * fix double grad strip error for high differential * fix double grad strip error for high differential * Add Sigmoid triple grad tests * fix dygraph double grad dtype error when calling for high differential senario * Updated triple grad teses func * Use np.random to initialize ddx * Updated triple_grad_check func * add todo for gradient checker and refine some comments * remove additional code * add test for warnging in backward.py * add tanh triple grad * format python code * refine code * make test_activation_nn_grad test time to 150s Co-authored-by: veyron95 Co-authored-by: levi131 --- python/paddle/fluid/tests/unittests/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) 
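
Editor's note (not part of the patch): the commit message above describes adding sigmoid/tanh triple-grad support and high-order gradients in dygraph mode, with the tests routed through `gradient_checker.triple_grad_check`. The sketch below is only a minimal standalone illustration of third-order gradients via the public `paddle.grad` API; it assumes a Paddle build in which tanh's higher-order gradients are registered (which is what this patch series adds) and is not the patch's own test code.

```python
import numpy as np
import paddle

x = paddle.to_tensor([0.5], dtype='float64', stop_gradient=False)
y = paddle.tanh(x)

(dy,) = paddle.grad(y, x, create_graph=True)     # 1st order: 1 - y^2
(d2y,) = paddle.grad(dy, x, create_graph=True)   # 2nd order: -2*y*(1 - y^2)
(d3y,) = paddle.grad(d2y, x)                     # 3rd order: -2*(1 - y^2)*(1 - 3*y^2)

t = np.tanh(0.5)
expected = -2.0 * (1.0 - t ** 2) * (1.0 - 3.0 * t ** 2)
np.testing.assert_allclose(d3y.numpy(), [expected], atol=1e-5)
```
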
diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 745e711852272..ac7471f8edfa4 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -846,7 +846,7 @@ set_tests_properties(test_inplace_softmax_with_cross_entropy PROPERTIES TIMEOUT set_tests_properties(test_cross_entropy2_op PROPERTIES TIMEOUT 120) set_tests_properties(test_fetch_unmerged PROPERTIES TIMEOUT 120) set_tests_properties(test_gru_unit_op PROPERTIES TIMEOUT 120) -set_tests_properties(test_activation_nn_grad PROPERTIES TIMEOUT 120) +set_tests_properties(test_activation_nn_grad PROPERTIES TIMEOUT 150) set_tests_properties(test_empty_op PROPERTIES TIMEOUT 120) set_tests_properties(test_parallel_executor_transformer PROPERTIES TIMEOUT 120) set_tests_properties(test_elementwise_div_op PROPERTIES TIMEOUT 120) From 4bd19770d9dc485a559f3ac698ba3a4d2c117943 Mon Sep 17 00:00:00 2001 From: wenbin Date: Wed, 20 Oct 2021 10:44:22 +0800 Subject: [PATCH 039/116] fix (#36557) * fix * remove const --- .../inference/tensorrt/convert/pool2d_op.cc | 35 +++++++++++++------ 1 file changed, 24 insertions(+), 11 deletions(-) diff --git a/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc index e03842db2b827..05cd7bad5cbac 100644 --- a/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc @@ -115,17 +115,17 @@ class Pool2dOpConverter : public OpConverter { nvinfer1::DimsHW nv_paddings(paddings[0], paddings[1]); nvinfer1::ILayer *layer = nullptr; - nvinfer1::DimsHW pre_pad(0, 0); - nvinfer1::DimsHW post_pad(0, 0); + nvinfer1::DimsHW g_pre_pad(0, 0); + nvinfer1::DimsHW g_post_pad(0, 0); // paddle Non ceil_mode : Output size = (input size - filter size + 2 * // padding) / stride (stride size) + 1 // tensorrt EXPLICIT_ROUND_DOWN: O = floor((M - DK) / S) + 1 // so if M - DK < 0 we need extra padding if (input_shape.d[input_dims - 2] - ksize[0] + 2 * paddings[0] < 0) { - post_pad.h() = strides[0] - 1; + g_post_pad.h() = strides[0] - 1; } if (input_shape.d[input_dims - 1] - ksize[1] + 2 * paddings[1] < 0) { - post_pad.w() = strides[1] - 1; + g_post_pad.w() = strides[1] - 1; } if (op_desc.HasAttr("enable_int8")) { @@ -138,10 +138,10 @@ class Pool2dOpConverter : public OpConverter { if (engine_->with_dynamic_shape()) { if (!adaptive && !global_pooling && !ceil_mode) { - if ((post_pad.w() > 0 || post_pad.h() > 0) && + if ((g_post_pad.w() > 0 || g_post_pad.h() > 0) && (padding_algorithm != "SAME")) { auto *pad_layer = TRT_ENGINE_ADD_LAYER(engine_, Padding, *input1, - pre_pad, post_pad); + g_pre_pad, g_post_pad); PADDLE_ENFORCE_NOT_NULL( pad_layer, platform::errors::Fatal( "Pad layer in poolOp converter could not be " @@ -230,22 +230,35 @@ class Pool2dOpConverter : public OpConverter { if (!adaptive) { if (ceil_mode) { + nvinfer1::DimsHW pre_pad(0, 0); + nvinfer1::DimsHW post_pad(0, 0); // If ceil mode is true, we will pad the appropriate size to the input. DealCeilMode(input_shape, ksize, strides, paddings, &pre_pad, &post_pad, input_dims); - } - - if ((post_pad.w() > 0 || post_pad.h() > 0) && - (padding_algorithm != "SAME")) { auto *pad_layer = TRT_ENGINE_ADD_LAYER(engine_, Padding, *input1, pre_pad, post_pad); + PADDLE_ENFORCE_NOT_NULL( pad_layer, platform::errors::Fatal( "Pad layer in poolOp converter could not be " "created. 
The pointer to pad layer is `NULL`.")); input1 = pad_layer->getOutput(0); } - +#if IS_TRT_VERSION_GE(8000) + // Exclude padding pixels from the average mean is not supported well by + // TRT + // so enable padding for trt8.0 above. + if ((g_post_pad.w() > 0 || g_post_pad.h() > 0) && + (padding_algorithm != "SAME") && !ceil_mode) { + auto *pad_layer = TRT_ENGINE_ADD_LAYER(engine_, Padding, *input1, + g_pre_pad, g_post_pad); + PADDLE_ENFORCE_NOT_NULL( + pad_layer, platform::errors::Fatal( + "Pad layer in poolOp converter could not be " + "created. The pointer to pad layer is `NULL`.")); + input1 = pad_layer->getOutput(0); + } +#endif auto *pool_layer = TRT_ENGINE_ADD_LAYER(engine_, Pooling, *input1, nv_pool_type, nv_ksize); PADDLE_ENFORCE_NOT_NULL( From 6524fa8d335725d6d86e43c0fc809538650f6645 Mon Sep 17 00:00:00 2001 From: Huihuang Zheng Date: Wed, 20 Oct 2021 11:08:58 +0800 Subject: [PATCH 040/116] Add CINN Compile Option (#36292) Add CINN compile option in CMake. Now you can use CINN in Paddle by `-DWITH_CINN=ON` when `cmake` To test it, you can run `make cinn_lib_test -j` and `ctest -R cinn_lib_test`. Note: 1. You should set ``` export runtime_include_dir=${CINN_SOURCE_DIR}/cinn/runtime/cuda ``` When run test, the `${CINN_SOURCE_DIR}` should be set based on your CINN directory. 2. CINN is under developing now, you may have to change `CINN_GIT_TAG` to the git commit you need. --- CMakeLists.txt | 5 + cmake/cinn.cmake | 112 +++++++++++++++ paddle/fluid/framework/ir/CMakeLists.txt | 3 + paddle/fluid/framework/ir/cinn_lib_test.cc | 151 +++++++++++++++++++++ 4 files changed, 271 insertions(+) create mode 100644 cmake/cinn.cmake create mode 100644 paddle/fluid/framework/ir/cinn_lib_test.cc diff --git a/CMakeLists.txt b/CMakeLists.txt index 98772e9678153..d4a0eb067b4f1 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -214,6 +214,7 @@ option(PY_VERSION "Compile PaddlePaddle with python3 support" ${PY_VER option(WITH_DGC "Use DGC(Deep Gradient Compression) or not" ${WITH_DISTRIBUTE}) option(SANITIZER_TYPE "Choose the type of sanitizer, options are: Address, Leak, Memory, Thread, Undefined" OFF) option(WITH_LITE "Compile Paddle Fluid with Lite Engine" OFF) +option(WITH_CINN "Compile PaddlePaddle with CINN" OFF) option(WITH_NCCL "Compile PaddlePaddle with NCCL support" ON) option(WITH_RCCL "Compile PaddlePaddle with RCCL support" ON) option(WITH_XPU_BKCL "Compile PaddlePaddle with BAIDU KUNLUN XPU BKCL" OFF) @@ -299,6 +300,10 @@ if(WITH_GPU) endif() endif() +if(WITH_CINN) + include(cinn) +endif() + if(WITH_ROCM) include(hip) include(miopen) # set miopen libraries, must before configure diff --git a/cmake/cinn.cmake b/cmake/cinn.cmake new file mode 100644 index 0000000000000..dd5f809e9581a --- /dev/null +++ b/cmake/cinn.cmake @@ -0,0 +1,112 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +if (NOT WITH_CINN) + return() +endif() + +# TODO(zhhsplendid): CINN has lots of warnings during early development. 
+# They will be treated as errors under paddle. We set no-error now and we will +# clean the code in the future. +add_definitions(-w) + +###################################### +# Build CINN from Git External Project +###################################### +include(ExternalProject) +set(CINN_SOURCE_DIR ${THIRD_PARTY_PATH}/CINN) +# TODO(zhhsplendid): Modify git tag after we have release tag +set(CINN_GIT_TAG 3f004bfa3ed273ecf1de8e7b946433038c79b84f) +set(CINN_OPTIONAL_ARGS -DWITH_CUDA=${WITH_GPU} -DWITH_CUDNN=${WITH_GPU} -DPUBLISH_LIBS=ON) +set(CINN_BUILD_COMMAND $(MAKE) cinncore -j && $(MAKE) cinnapi -j) +ExternalProject_Add( + external_cinn + ${EXTERNAL_PROJECT_LOG_ARGS} + GIT_REPOSITORY "${GIT_URL}/PaddlePaddle/CINN.git" + GIT_TAG ${CINN_GIT_TAG} + PREFIX ${CINN_SOURCE_DIR} + UPDATE_COMMAND "" + BUILD_COMMAND ${CINN_BUILD_COMMAND} + INSTALL_COMMAND "" + CMAKE_ARGS ${CINN_OPTIONAL_ARGS}) + + + +ExternalProject_Get_property(external_cinn BINARY_DIR) +ExternalProject_Get_property(external_cinn SOURCE_DIR) +set(CINN_BINARY_DIR ${BINARY_DIR}) +set(CINN_SOURCE_DIR ${SOURCE_DIR}) + +message(STATUS "CINN BINARY_DIR: ${CINN_BINARY_DIR}") +message(STATUS "CINN SOURCE_DIR: ${CINN_SOURCE_DIR}") + + +######################### +# Add CINN's dependencies +######################### + +# Add absl +set(ABSL_LIB_NAMES + hash + wyhash + city + strings + throw_delegate + bad_any_cast_impl + bad_optional_access + bad_variant_access + raw_hash_set + ) +set(ABSL_LIB_DIR "${CINN_BINARY_DIR}/dist/third_party/absl/lib") +set(ABSL_INCLUDE_DIR "${CINN_BINARY_DIR}/dist/third_party/absl/include") +add_library(absl STATIC IMPORTED GLOBAL) +set_target_properties(absl PROPERTIES IMPORTED_LOCATION ${ABSL_LIB_DIR}/libabsl_base.a) +foreach(lib_name ${ABSL_LIB_NAMES}) + target_link_libraries(absl INTERFACE ${ABSL_LIB_DIR}/libabsl_${lib_name}.a) +endforeach() +include_directories(${ABSL_INCLUDE_DIR}) + +# Add isl +set(ISL_LIB_DIR "${CINN_BINARY_DIR}/dist/third_party/isl/lib") +set(ISL_INCLUDE_DIR "${CINN_BINARY_DIR}/dist/third_party/isl/include") +add_library(isl STATIC IMPORTED GLOBAL) +set_target_properties(isl PROPERTIES IMPORTED_LOCATION ${ISL_LIB_DIR}/libisl.a) +include_directories(${ISL_INCLUDE_DIR}) + +# Add LLVM +set(LLVM_LIB_NAMES + ExecutionEngine + ) +set(LLVM_LIB_DIR "${CINN_BINARY_DIR}/dist/third_party/llvm/lib") +set(LLVM_INCLUDE_DIR "${CINN_BINARY_DIR}/dist/third_party/llvm/include") +add_library(llvm STATIC IMPORTED GLOBAL) +set_target_properties(llvm PROPERTIES IMPORTED_LOCATION ${LLVM_LIB_DIR}/libLLVMCore.a) +foreach(lib_name ${LLVM_LIB_NAMES}) + target_link_libraries(llvm INTERFACE ${LLVM_LIB_DIR}/libLLVM${lib_name}.a) +endforeach() +include_directories(${LLVM_INCLUDE_DIR}) + +###################################################### +# Put external_cinn and dependencies together as a lib +###################################################### + +set(CINN_LIB_NAME "libcinnapi.so") +set(CINN_LIB_LOCATION "${CINN_BINARY_DIR}/dist/cinn/lib") +set(CINN_INCLUDE_DIR "${CINN_BINARY_DIR}/dist/cinn/include") + +add_library(cinn SHARED IMPORTED GLOBAL) +set_target_properties(cinn PROPERTIES IMPORTED_LOCATION "${CINN_LIB_LOCATION}/${CINN_LIB_NAME}") +include_directories(${CINN_INCLUDE_DIR}) +add_dependencies(cinn external_cinn absl isl llvm glog gflag) + diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index 904450b5b251e..7b80d331ff707 100644 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -143,6 +143,9 @@ cc_test(pass_test 
SRCS pass_test.cc DEPS graph pass graph_helper) cc_test(graph_test SRCS graph_test.cc DEPS graph graph_helper op_registry) cc_test(graph_helper_test SRCS graph_helper_test.cc DEPS graph graph_helper op_registry) cc_test(graph_to_program_pass_test SRCS graph_to_program_pass_test.cc DEPS graph_to_program_pass) +if (WITH_CINN) + cc_test(cinn_lib_test SRCS cinn_lib_test.cc DEPS cinn) +endif() cc_test(cost_model_test SRCS cost_model_test.cc DEPS cost_model op_registry) cc_test(test_graph_pattern_detector SRCS graph_pattern_detector_tester.cc DEPS graph_pattern_detector) cc_test(test_op_compat_sensible_pass SRCS op_compat_sensible_pass_tester.cc DEPS op_compat_sensible_pass) diff --git a/paddle/fluid/framework/ir/cinn_lib_test.cc b/paddle/fluid/framework/ir/cinn_lib_test.cc new file mode 100644 index 0000000000000..cdee45a06c71a --- /dev/null +++ b/paddle/fluid/framework/ir/cinn_lib_test.cc @@ -0,0 +1,151 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include + +#include +#include +#include +#include + +#ifdef PADDLE_WITH_CUDA +#include +#endif + +#include "cinn/common/target.h" +#include "cinn/frontend/net_builder.h" +#include "cinn/frontend/syntax.h" +#include "cinn/hlir/framework/graph.h" +#include "cinn/hlir/framework/graph_compiler.h" +#include "cinn/hlir/framework/pass.h" +#include "cinn/hlir/framework/tensor.h" +#include "cinn/hlir/op/use_ops.h" +#include "cinn/hlir/pass/use_pass.h" + +namespace cinn { +namespace frontend { + +Program CreateAddProgram() { + constexpr int M = 32; + constexpr int N = 24; + + NetBuilder builder("net_builder"); + auto a = builder.CreateInput(Float(32), {M, N}); + auto b = builder.CreateInput(Float(32), {M, N}); + auto c = builder.add(a, b); + auto d = builder.add(a, c); + auto program = builder.Build(); + + return program; +} + +void SetRandData(hlir::framework::Tensor tensor, Target target) { + auto* data = tensor->mutable_data(target); + std::random_device seed; + std::default_random_engine engine(seed()); + std::uniform_real_distribution dist(0.f, 1.f); + size_t num_ele = tensor->shape().numel(); + std::vector random_data(num_ele); + for (size_t i = 0; i < num_ele; i++) { + random_data[i] = dist(engine); // All random data + } + +#ifdef PADDLE_WITH_CUDA + cudaMemcpy(data, random_data.data(), num_ele * sizeof(float), + cudaMemcpyHostToDevice); +#else + std::copy(random_data.begin(), random_data.end(), data); +#endif +} + +TEST(net_build, basic) { + auto program = CreateAddProgram(); + // output program + for (size_t i = 0; i < program.size(); i++) { + LOG(INFO) << "instruction: " << program[i]; + } +} + +TEST(net_build, program_execute_multi_elementwise_add) { + auto program = CreateAddProgram(); +#ifdef PADDLE_WITH_CUDA + Target target = common::DefaultNVGPUTarget(); +#else + Target target = common::DefaultHostTarget(); +#endif + + auto graph = std::make_shared(program, target); + std::cout << "graph:\n" << graph->Visualize() << std::endl; + + auto scope = BuildScope(target, graph); + 
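+  // GraphCompiler lowers the CINN graph into an executable runtime program.
+  // The scope holds the input tensors "A" and "B", which are filled with
+  // random data before the compiled program is executed.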
hlir::framework::GraphCompiler gc(target, scope, graph); + auto runtime_program = gc.Build(); + + scope->Var("A"); + scope->Var("B"); + + auto A = scope->GetTensor("A"); + auto B = scope->GetTensor("B"); + SetRandData(A, target); + SetRandData(B, target); + + runtime_program->Execute(); +} + +TEST(net_build, program_execute_fc) { + constexpr int B = 10; // batch size + constexpr int M = 32; + constexpr int K = 18; + constexpr int N = 24; + + NetBuilder builder("net_builder"); + auto a = builder.CreateInput(Float(32), {B, M, K}, "A"); + auto w = builder.CreateInput(Float(32), {N, K}, "W"); // weight + auto b = builder.CreateInput(Float(32), {N}, "B"); // bias + + auto mul_out = builder.mul(a, w, 2, 1); + auto add_out = builder.add(mul_out, b); + auto program = builder.Build(); + +#ifdef PADDLE_WITH_CUDA + Target target = common::DefaultNVGPUTarget(); +#else + Target target = common::DefaultHostTarget(); +#endif + + auto graph = std::make_shared(program, target); + auto scope = BuildScope(target, graph); + hlir::framework::GraphCompiler gc(target, scope, graph); + auto runtime_program = gc.Build(); + + scope->Var(std::string(a.id())); + scope->Var(std::string(w.id())); + scope->Var(std::string(b.id())); + scope->Var(std::string(mul_out->id)); + + auto a_ten = scope->GetTensor(std::string(a.id())); + auto w_ten = scope->GetTensor(std::string(w.id())); + auto b_ten = scope->GetTensor(std::string(b.id())); + auto fake_out_ten = scope->GetTensor(std::string(mul_out->id)); + auto add_out_ten = scope->GetTensor(std::string(add_out->id)); + SetRandData(a_ten, target); + SetRandData(w_ten, target); + SetRandData(b_ten, target); + + runtime_program->Execute(); +} + +} // namespace frontend +} // namespace cinn From 8ca5206bab9ab6e13bf9367e431a3211b70a900b Mon Sep 17 00:00:00 2001 From: zmx Date: Wed, 20 Oct 2021 11:15:59 +0800 Subject: [PATCH 041/116] fix SerializeSelectedRows (#36543) * bug fix for DeserializeSelectedRows. test=develop * fix bug for SerializeSelectedRows. test=develop * update. test=develop --- paddle/fluid/distributed/service/brpc_utils.cc | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) diff --git a/paddle/fluid/distributed/service/brpc_utils.cc b/paddle/fluid/distributed/service/brpc_utils.cc index 376e820cb7a74..92dcde99cccb0 100644 --- a/paddle/fluid/distributed/service/brpc_utils.cc +++ b/paddle/fluid/distributed/service/brpc_utils.cc @@ -138,23 +138,11 @@ void SerializeSelectedRows(framework::Variable* var, var_data->clear(); var_data->resize(rows->size() * sizeof(int64_t)); char* data_ptr = const_cast(var_data->data()); - - if (platform::is_cpu_place(tensor->place())) { - memcpy(data_ptr, &(*rows)[0], rows->size() * sizeof(int64_t)); - } else { -#ifdef PADDLE_WITH_CUDA - auto stream = - reinterpret_cast(ctx).stream(); - memory::Copy(platform::CPUPlace(), data_ptr, - BOOST_GET_CONST(platform::CUDAPlace, tensor->place()), - &(*rows)[0], rows->size() * sizeof(int64_t), stream); -#endif - } + memcpy(data_ptr, &((*rows)[0]), rows->size() * sizeof(int64_t)); var_msg->set_data_type(static_cast(tensor->type())); for (auto& dim : framework::vectorize(tensor->dims())) { var_msg->add_dims(dim); } - // IO Buffer if (platform::is_cpu_place(tensor->place())) { auto data_len = tensor->numel() * framework::SizeOfType(tensor->type()); From 06bd348d3c62874511f6f36af760063b50e054ca Mon Sep 17 00:00:00 2001 From: Wilber Date: Wed, 20 Oct 2021 11:26:10 +0800 Subject: [PATCH 042/116] update for trt convert ut. 
(#36549) --- paddle/fluid/inference/tensorrt/op_teller.cc | 8 +++ .../inference/test_trt_convert_activation.py | 1 + .../test_trt_convert_affine_channel.py | 1 + .../inference/test_trt_convert_elementwise.py | 1 + .../test_trt_convert_emb_eltwise_layernorm.py | 1 + .../ir/inference/test_trt_convert_flatten.py | 65 +++++++++++++++---- .../ir/inference/test_trt_convert_gather.py | 1 + .../inference/test_trt_convert_gather_nd.py | 1 + .../ir/inference/test_trt_convert_gelu.py | 1 + .../inference/test_trt_convert_group_norm.py | 1 + .../ir/inference/test_trt_convert_prelu.py | 14 ++++ .../ir/inference/test_trt_convert_reshape.py | 1 + .../ir/inference/test_trt_convert_scale.py | 1 + .../test_trt_convert_shuffle_channel.py | 1 + .../ir/inference/test_trt_convert_swish.py | 1 + .../inference/test_trt_convert_transpose.py | 1 + 16 files changed, 88 insertions(+), 12 deletions(-) diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc index e7318d07611ea..0d0a656c5b607 100644 --- a/paddle/fluid/inference/tensorrt/op_teller.cc +++ b/paddle/fluid/inference/tensorrt/op_teller.cc @@ -1104,6 +1104,14 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, return false; } } + +#if IS_TRT_VERSION_LT(7000) + if (!with_dynamic_shape) { + // TODO(inference): fix trt6 static plugin error. + VLOG(3) << "prelu static plugin in trt6 has bug."; + return false; + } +#endif } if (op_type == "mish") { diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_activation.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_activation.py index 9dc89bb9836d0..a87cab3430cd3 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_activation.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_activation.py @@ -14,6 +14,7 @@ from trt_layer_auto_scan_test import TrtLayerAutoScanTest, SkipReasons from program_config import TensorConfig, ProgramConfig +import unittest import numpy as np import paddle.inference as paddle_infer from functools import partial diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_affine_channel.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_affine_channel.py index 1e6c94f145497..33eb90b9f9123 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_affine_channel.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_affine_channel.py @@ -14,6 +14,7 @@ from trt_layer_auto_scan_test import TrtLayerAutoScanTest, SkipReasons from program_config import TensorConfig, ProgramConfig +import unittest import numpy as np import paddle.inference as paddle_infer from functools import partial diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_elementwise.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_elementwise.py index c8cba0f372380..992e0353837bc 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_elementwise.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_elementwise.py @@ -14,6 +14,7 @@ from trt_layer_auto_scan_test import TrtLayerAutoScanTest, SkipReasons from program_config import TensorConfig, ProgramConfig +import unittest import numpy as np import paddle.inference as paddle_infer from functools import partial diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_emb_eltwise_layernorm.py 
b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_emb_eltwise_layernorm.py index d7b0bcd908085..356a2c942df0d 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_emb_eltwise_layernorm.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_emb_eltwise_layernorm.py @@ -18,6 +18,7 @@ import paddle.inference as paddle_infer from functools import partial from typing import Optional, List, Callable, Dict, Any, Set +import unittest class TrtConvertEmbEltwiseLayernormTest1(TrtLayerAutoScanTest): diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_flatten.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_flatten.py index 4b461c75f0b28..7b0089ab9ab7f 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_flatten.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_flatten.py @@ -14,6 +14,7 @@ from trt_layer_auto_scan_test import TrtLayerAutoScanTest, SkipReasons from program_config import TensorConfig, ProgramConfig +import unittest import numpy as np import paddle.inference as paddle_infer from functools import partial @@ -73,10 +74,20 @@ def clear_dynamic_shape(): self.dynamic_shape.opt_input_shape = {} def generate_trt_nodes_num(attrs, dynamic_shape): - if attrs[0]['axis'] == 1: - return 1, 2 + ver = paddle_infer.get_trt_compile_version() + if ver[0] * 1000 + ver[1] * 100 + ver[0] * 10 >= 7130: + if attrs[0]['axis'] == 1: + return 1, 2 + else: + return 0, 3 else: - return 0, 3 + if dynamic_shape: + return 0, 3 + + if attrs[0]['axis'] == 1: + return 1, 2 + else: + return 0, 3 attrs = [ program_config.ops[i].attrs @@ -157,10 +168,20 @@ def clear_dynamic_shape(): self.dynamic_shape.opt_input_shape = {} def generate_trt_nodes_num(attrs, dynamic_shape): - if attrs[0]['axis'] == 1: - return 1, 2 + ver = paddle_infer.get_trt_compile_version() + if ver[0] * 1000 + ver[1] * 100 + ver[0] * 10 >= 7130: + if attrs[0]['axis'] == 1: + return 1, 2 + else: + return 0, 3 else: - return 0, 3 + if dynamic_shape: + return 0, 3 + + if attrs[0]['axis'] == 1: + return 1, 2 + else: + return 0, 3 attrs = [ program_config.ops[i].attrs @@ -241,10 +262,20 @@ def clear_dynamic_shape(): self.dynamic_shape.opt_input_shape = {} def generate_trt_nodes_num(attrs, dynamic_shape): - if attrs[0]['axis'] == 1: - return 1, 2 + ver = paddle_infer.get_trt_compile_version() + if ver[0] * 1000 + ver[1] * 100 + ver[0] * 10 >= 7130: + if attrs[0]['axis'] == 1: + return 1, 2 + else: + return 0, 3 else: - return 0, 3 + if dynamic_shape: + return 0, 3 + + if attrs[0]['axis'] == 1: + return 1, 2 + else: + return 0, 3 attrs = [ program_config.ops[i].attrs @@ -325,10 +356,20 @@ def clear_dynamic_shape(): self.dynamic_shape.opt_input_shape = {} def generate_trt_nodes_num(attrs, dynamic_shape): - if attrs[0]['axis'] == 1: - return 1, 2 + ver = paddle_infer.get_trt_compile_version() + if ver[0] * 1000 + ver[1] * 100 + ver[0] * 10 >= 7130: + if attrs[0]['axis'] == 1: + return 1, 2 + else: + return 0, 3 else: - return 0, 3 + if dynamic_shape: + return 0, 3 + + if attrs[0]['axis'] == 1: + return 1, 2 + else: + return 0, 3 attrs = [ program_config.ops[i].attrs diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_gather.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_gather.py index 9a3c9aff61b98..37d23cb18d843 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_gather.py +++ 
b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_gather.py @@ -19,6 +19,7 @@ from functools import partial from typing import Optional, List, Callable, Dict, Any, Set import logging +import unittest class TrtConvertGatherTest(TrtLayerAutoScanTest): diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_gather_nd.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_gather_nd.py index a109abdc298a6..0c7eae5f85f95 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_gather_nd.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_gather_nd.py @@ -18,6 +18,7 @@ import paddle.inference as paddle_infer from functools import partial from typing import Optional, List, Callable, Dict, Any, Set +import unittest class TrtConvertGatherNdTest_dim_4_1(TrtLayerAutoScanTest): diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_gelu.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_gelu.py index f9c3d09ef446f..2f75e4e723e28 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_gelu.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_gelu.py @@ -18,6 +18,7 @@ import paddle.inference as paddle_infer from functools import partial from typing import Optional, List, Callable, Dict, Any, Set +import unittest class TrtConvertGeluTest(TrtLayerAutoScanTest): diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_group_norm.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_group_norm.py index b6b5aa9dbfe95..203e86c4b25de 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_group_norm.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_group_norm.py @@ -18,6 +18,7 @@ import paddle.inference as paddle_infer from functools import partial from typing import Optional, List, Callable, Dict, Any, Set +import unittest class TrtConvertGroupNormTest(TrtLayerAutoScanTest): diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_prelu.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_prelu.py index 4122e2623cb5a..fbb78fceb3e84 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_prelu.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_prelu.py @@ -18,6 +18,7 @@ import paddle.inference as paddle_infer from functools import partial from typing import Optional, List, Callable, Dict, Any, Set +import unittest class TrtConvertPreluTest(TrtLayerAutoScanTest): @@ -186,6 +187,19 @@ def teller2(program_config, predictor_config): "Need to repair the case: the output of GPU and tensorrt has diff when the input dimension is 2 in static shape mode." ) + ver = paddle_infer.get_trt_compile_version() + if ver[0] * 1000 + ver[1] * 100 + ver[0] * 10 < 7000: + + def teller(program_config, predictor_config): + if not predictor_config.tensorrt_dynamic_shape_enabled(): + return True + return False + + self.add_skip_case( + teller, SkipReasons.TRT_NOT_IMPLEMENTED, + "Need to repair the case: the output of GPU and tensorrt has diff in trt6, the prelu static plugin has bug." 
+ ) + def test(self): self.add_skip_trt_case() self.run_test() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_reshape.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_reshape.py index cf7ab11c35de7..4355b83557fc6 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_reshape.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_reshape.py @@ -18,6 +18,7 @@ import paddle.inference as paddle_infer from functools import partial from typing import Optional, List, Callable, Dict, Any, Set +import unittest class TrtConvertReshapeTest(TrtLayerAutoScanTest): diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_scale.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_scale.py index 8a44617dc8dc3..51bcee080376e 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_scale.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_scale.py @@ -18,6 +18,7 @@ import paddle.inference as paddle_infer from functools import partial from typing import Optional, List, Callable, Dict, Any, Set +import unittest class TrtConvertScaleTest(TrtLayerAutoScanTest): diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_shuffle_channel.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_shuffle_channel.py index 264ba31ad2716..c6a8147236044 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_shuffle_channel.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_shuffle_channel.py @@ -18,6 +18,7 @@ import paddle.inference as paddle_infer from functools import partial from typing import Optional, List, Callable, Dict, Any, Set +import unittest class TrtConvertShuffleChannelTest(TrtLayerAutoScanTest): diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_swish.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_swish.py index e162988bbb1b3..5eb4e8505ff22 100755 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_swish.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_swish.py @@ -18,6 +18,7 @@ import paddle.inference as paddle_infer from functools import partial from typing import Optional, List, Callable, Dict, Any, Set +import unittest class TrtConvertSwishTest(TrtLayerAutoScanTest): diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_transpose.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_transpose.py index ad325bb0ab3b0..31b4d027f1780 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_transpose.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_transpose.py @@ -18,6 +18,7 @@ import paddle.inference as paddle_infer from functools import partial from typing import Optional, List, Callable, Dict, Any, Set +import unittest class TrtConvertTransposeTest(TrtLayerAutoScanTest): From 7325c9fb44e9ae600bc299ff1badfa87873ed5eb Mon Sep 17 00:00:00 2001 From: Wilber Date: Wed, 20 Oct 2021 11:26:22 +0800 Subject: [PATCH 043/116] add unittest (#36371) --- paddle/fluid/inference/tensorrt/op_teller.cc | 109 ++++++++++++++++-- .../tensorrt/plugin/hard_swish_op_plugin.h | 2 +- .../test_trt_convert_anchor_generator.py | 6 +- .../inference/test_trt_convert_batch_norm.py | 13 +++ .../ir/inference/test_trt_convert_clip.py | 18 ++- .../ir/inference/test_trt_convert_concat.py | 13 
+++ .../ir/inference/test_trt_convert_dropout.py | 9 +- .../test_trt_convert_hard_sigmoid.py | 1 + .../test_trt_convert_multihead_matmul.py | 7 +- .../inference/test_trt_convert_reduce_sum.py | 10 +- .../inference/test_trt_convert_roi_align.py | 2 + .../test_trt_convert_skip_layernorm.py | 1 + .../ir/inference/test_trt_convert_slice.py | 6 +- .../ir/inference/test_trt_convert_softmax.py | 13 ++- .../ir/inference/test_trt_convert_split.py | 13 +++ .../ir/inference/test_trt_convert_stack.py | 1 + .../ir/inference/test_trt_convert_tile.py | 10 +- .../ir/inference/test_trt_convert_yolo_box.py | 1 + .../ir/inference/trt_layer_auto_scan_test.py | 8 +- 19 files changed, 208 insertions(+), 35 deletions(-) diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc index 0d0a656c5b607..91515f1fa5811 100644 --- a/paddle/fluid/inference/tensorrt/op_teller.cc +++ b/paddle/fluid/inference/tensorrt/op_teller.cc @@ -174,6 +174,12 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, << " op does not support input's dim is 1 in tensorrt."; return false; } + // TODO(inference): fix + if (x_shape.size() == 2 && !with_dynamic_shape) { + VLOG(3) << "activation op does not support input's dim is 2 in " + "tensorrt static shape, the output shape has diff."; + return false; + } } if (op_type == "pool2d") { @@ -346,6 +352,24 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, } } } + if (op_type == "softmax") { + auto* block = desc.Block(); + if (block == nullptr) { + VLOG(3) << "The block desc is nullptr, we can't continue to analyze. " + "Developers need to check whether block_desc is passed in " + "the pass."; + return false; + } + auto x_var_name = desc.Input("X")[0]; + auto* x_var_desc = block->FindVar(x_var_name); + const auto x_shape = x_var_desc->GetShape(); + // TODO(inference): fix + if (x_shape.size() == 2 && !with_dynamic_shape) { + VLOG(3) << "softmax op does not support input's dim is 2 in tensorrt " + "static shape, the output shape has diff."; + return false; + } + } if (op_type == "group_norm") { if (!with_dynamic_shape) return false; bool has_attrs = (desc.HasAttr("epsilon") && desc.HasAttr("groups")); @@ -357,20 +381,35 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, if (op_type == "concat") { if (!desc.HasAttr("axis")) { return false; + } + int axis = BOOST_GET_CONST(int, desc.GetAttr("axis")); + if (with_dynamic_shape) { + if (axis < 0) return false; } else { - int axis = BOOST_GET_CONST(int, desc.GetAttr("axis")); - if (with_dynamic_shape) { - if (axis < 0) return false; - } else { - if (axis <= 0) return false; - } - auto concat_inputs = desc.Inputs(); - if (concat_inputs.find("AxisTensor") != concat_inputs.end()) { - if (desc.Input("AxisTensor").size() >= 1) { - return false; - } + if (axis <= 0) return false; + } + auto concat_inputs = desc.Inputs(); + if (concat_inputs.find("AxisTensor") != concat_inputs.end()) { + if (desc.Input("AxisTensor").size() >= 1) { + return false; } } + auto* block = desc.Block(); + if (block == nullptr) { + VLOG(3) << "The block desc is nullptr, we can't continue to analyze. 
" + "Developers need to check whether block_desc is passed in " + "the pass."; + return false; + } + auto x_var_name = desc.Input("X")[0]; + auto* x_var_desc = block->FindVar(x_var_name); + const auto x_shape = x_var_desc->GetShape(); + // TODO(inference): fix + if (x_shape.size() == 2 && !with_dynamic_shape) { + VLOG(3) << "concat op does not support input's dim is 2 in tensorrt " + "static shape, the output shape has diff."; + return false; + } } if (op_type == "transpose2" || op_type == "transpose") { if (!desc.HasAttr("axis")) { @@ -687,6 +726,22 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, << desc.Output("Y").size() << "."; return false; } + auto* block = desc.Block(); + if (block == nullptr) { + VLOG(3) << "The block desc is nullptr, we can't continue to analyze. " + "Developers need to check whether block_desc is passed in " + "the pass."; + return false; + } + auto x_var_name = desc.Input("X")[0]; + auto* x_var_desc = block->FindVar(x_var_name); + const auto x_shape = x_var_desc->GetShape(); + // TODO(inference): fix + if (x_shape.size() == 2 && !with_dynamic_shape) { + VLOG(3) << "batch_norm op does not support input's dim is 2 in " + "tensorrt static shape, the output shape has diff."; + return false; + } } if (op_type == "split") { @@ -774,6 +829,12 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, VLOG(3) << "The output_length should be equal to the output size."; return false; } + // TODO(inference): fix + if (x_shape.size() == 2 && !with_dynamic_shape) { + VLOG(3) << "split op does not support input's dim is 2 in tensorrt " + "static shape. The output shape has diff."; + return false; + } } if (op_type == "scale") { auto scale_inputs = desc.Inputs(); @@ -926,6 +987,12 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, VLOG(3) << "gelu op does not support input's dim is 1 in tensorrt."; return false; } + // TODO(inference): fix + if (x_shape.size() == 2 && !with_dynamic_shape) { + VLOG(3) << "gelu op does not support input's dim is 2 in tensorrt " + "static shape, the output shape has diff."; + return false; + } } if (op_type == "layer_norm") { @@ -1041,7 +1108,13 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, auto* x_var_desc = block->FindVar(x_var_name); const auto x_shape = x_var_desc->GetShape(); if (x_shape.size() == 1) { - VLOG(3) << "dropout op does not support input's dim is 1 in tensorrt."; + VLOG(3) << "scale op does not support input's dim is 1 in tensorrt."; + return false; + } + // TODO(inference): fix + if (x_shape.size() == 2 && !with_dynamic_shape) { + VLOG(3) << "scale op does not support input's dim is 2 in tensorrt " + "static shape, the output shape has diff."; return false; } } @@ -1061,6 +1134,12 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, VLOG(3) << "swish op does not support input's dim is 1 in tensorrt."; return false; } + // TODO(inference): fix + if (x_shape.size() == 2 && !with_dynamic_shape) { + VLOG(3) << "swish op does not support input's dim is 2 in tensorrt " + "static shape, the output shape has diff."; + return false; + } } if (op_type == "prelu") { @@ -1314,6 +1393,12 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, VLOG(3) << "clip op does not support input's dim is 1 in tensorrt."; return false; } + // TODO(inference): fix + if (x_shape.size() == 2 && !with_dynamic_shape) { + VLOG(3) << "clip op does not support input's dim is 2 in tensorrt " + 
"static shape, the output shape has diff."; + return false; + } } if (op_type == "reduce_sum" || op_type == "reduce_mean") { diff --git a/paddle/fluid/inference/tensorrt/plugin/hard_swish_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/hard_swish_op_plugin.h index c0ee608c39dab..475c908c13bbf 100644 --- a/paddle/fluid/inference/tensorrt/plugin/hard_swish_op_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/hard_swish_op_plugin.h @@ -161,7 +161,7 @@ class HardSwishPluginDynamicCreator : public nvinfer1::IPluginCreator { public: HardSwishPluginDynamicCreator() {} const char* getPluginName() const TRT_NOEXCEPT override { - return "hardswish_plugin_dynamic"; + return "hard_swish_plugin_dynamic"; } const char* getPluginVersion() const TRT_NOEXCEPT override { return "1"; } diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_anchor_generator.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_anchor_generator.py index bf457a9da40a8..2dd380c53af44 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_anchor_generator.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_anchor_generator.py @@ -14,6 +14,7 @@ from trt_layer_auto_scan_test import TrtLayerAutoScanTest, SkipReasons from program_config import TensorConfig, ProgramConfig +import unittest import numpy as np import paddle.inference as paddle_infer from functools import partial @@ -83,7 +84,10 @@ def clear_dynamic_shape(): self.dynamic_shape.opt_input_shape = {} def generate_trt_nodes_num(attrs, dynamic_shape): - return 1, 3 + if dynamic_shape: + return 1, 3 + else: + return 0, 4 attrs = [ program_config.ops[i].attrs diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_batch_norm.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_batch_norm.py index ceda10d5d94aa..fc96f297918dd 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_batch_norm.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_batch_norm.py @@ -14,6 +14,7 @@ from trt_layer_auto_scan_test import TrtLayerAutoScanTest, SkipReasons from program_config import TensorConfig, ProgramConfig +import unittest import numpy as np import paddle.inference as paddle_infer from functools import partial @@ -211,6 +212,18 @@ def teller1(program_config, predictor_config): self.add_skip_case(teller1, SkipReasons.TRT_NOT_SUPPORT, "INPUT MomentumTensor NOT SUPPORT") + def teller2(program_config, predictor_config): + if len( + program_config.inputs['batch_norm_input'].shape + ) == 2 and not predictor_config.tensorrt_dynamic_shape_enabled(): + return True + return False + + self.add_skip_case( + teller2, SkipReasons.TRT_NOT_IMPLEMENTED, + "The output shape has diff, but we can add shuffle layer to resolve it." 
+ ) + def test(self): self.add_skip_trt_case() self.run_test() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_clip.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_clip.py index 95b4fb83d5bfd..081df87d10330 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_clip.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_clip.py @@ -18,6 +18,7 @@ import paddle.inference as paddle_infer from functools import partial from typing import Optional, List, Callable, Dict, Any, Set +import unittest class TrtConvertClipTest(TrtLayerAutoScanTest): @@ -84,8 +85,7 @@ def generate_weight2(attrs: List[Dict[str, Any]]): yield program_config - def sample_predictor_configs( - self, program_config) -> (paddle_infer.Config, List[int], float): + def sample_predictor_configs(self, program_config): def generate_dynamic_shape(attrs): if self.dims == 1: self.dynamic_shape.min_input_shape = {"input_data": [1]} @@ -146,7 +146,21 @@ def generate_trt_nodes_num(attrs, dynamic_shape): yield self.create_inference_config(), generate_trt_nodes_num(attrs, True), 1e-5 + def add_skip_trt_case(self): + def teller1(program_config, predictor_config): + if len( + program_config.inputs['input_data'].shape + ) == 2 and not predictor_config.tensorrt_dynamic_shape_enabled(): + return True + return False + + self.add_skip_case( + teller1, SkipReasons.TRT_NOT_IMPLEMENTED, + "The output shape has diff, but we can add shuffle layer to resolve it." + ) + def test(self): + self.add_skip_trt_case() self.run_test() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_concat.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_concat.py index 25e96787dd132..78ac06a323b1d 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_concat.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_concat.py @@ -14,6 +14,7 @@ from trt_layer_auto_scan_test import TrtLayerAutoScanTest, SkipReasons from program_config import TensorConfig, ProgramConfig +import unittest import numpy as np import paddle.inference as paddle_infer from functools import partial @@ -317,6 +318,18 @@ def teller1(program_config, predictor_config): self.add_skip_case(teller1, SkipReasons.TRT_NOT_SUPPORT, "INPUT AxisTensor NOT SUPPORT") + def teller2(program_config, predictor_config): + if len( + program_config.inputs['concat_input1'].shape + ) == 2 and not predictor_config.tensorrt_dynamic_shape_enabled(): + return True + return False + + self.add_skip_case( + teller2, SkipReasons.TRT_NOT_IMPLEMENTED, + "The output shape has diff, but we can add shuffle layer to resolve it." 
+ ) + def test(self): self.add_skip_trt_case() self.run_test() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_dropout.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_dropout.py index 28a85ce96c64f..57f5b5a0bb245 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_dropout.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_dropout.py @@ -14,6 +14,7 @@ from trt_layer_auto_scan_test import TrtLayerAutoScanTest, SkipReasons from program_config import TensorConfig, ProgramConfig +import unittest import numpy as np import paddle.inference as paddle_infer from functools import partial @@ -141,15 +142,19 @@ def generate_trt_nodes_num(attrs, dynamic_shape): def add_skip_trt_case(self): def teller1(program_config, predictor_config): - if self.dims == 2: + if len( + program_config.inputs['input_data'].shape + ) == 2 and not predictor_config.tensorrt_dynamic_shape_enabled(): return True return False self.add_skip_case( teller1, SkipReasons.TRT_NOT_IMPLEMENTED, - "When input dims is 2, pulgin will product a 4 dims output.") + "The output shape has diff, but we can add shuffle layer to resolve it." + ) def test(self): + self.add_skip_trt_case() self.run_test() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_hard_sigmoid.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_hard_sigmoid.py index d803d9e461613..c09c7f0bc9c2f 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_hard_sigmoid.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_hard_sigmoid.py @@ -18,6 +18,7 @@ import paddle.inference as paddle_infer from functools import partial from typing import Optional, List, Callable, Dict, Any, Set +import unittest class TrtConvertHardSigmoidTest_dim_2(TrtLayerAutoScanTest): diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_multihead_matmul.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_multihead_matmul.py index 0b98ab53fcc29..0754eede6d370 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_multihead_matmul.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_multihead_matmul.py @@ -14,6 +14,7 @@ from trt_layer_auto_scan_test import TrtLayerAutoScanTest, SkipReasons from program_config import TensorConfig, ProgramConfig +import unittest import numpy as np import paddle.inference as paddle_infer from functools import partial @@ -26,16 +27,16 @@ def is_program_valid(self, program_config: ProgramConfig) -> bool: def sample_program_configs(self): def generate_input1(batch, dim1): - return np.random.randn(batch, dim1, 768).astype(np.float32) + return np.random.random((batch, dim1, 768)).astype(np.float32) def generate_input2(shape): return np.random.random(shape).astype(np.float32) def generate_weight1(): - return np.random.randn(768, 768).astype(np.float32) + return np.random.random((768, 768)).astype(np.float32) def generate_weight2(): - return np.random.randn(768).astype(np.float32) + return np.random.random(768).astype(np.float32) for batch in [1, 2, 4]: self.batch = batch diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_reduce_sum.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_reduce_sum.py index 91e1c0677ac48..1cc9defa1010b 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_reduce_sum.py +++ 
b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_reduce_sum.py @@ -14,6 +14,7 @@ from trt_layer_auto_scan_test import TrtLayerAutoScanTest, SkipReasons from program_config import TensorConfig, ProgramConfig +import unittest import numpy as np import paddle.inference as paddle_infer from functools import partial @@ -84,8 +85,7 @@ def generate_input1(attrs: List[Dict[str, Any]]): yield program_config - def sample_predictor_configs( - self, program_config) -> (paddle_infer.Config, List[int], float): + def sample_predictor_configs(self, program_config): def generate_dynamic_shape(attrs): self.dynamic_shape.min_input_shape = {"input_data": [1, 3, 32, 32]} self.dynamic_shape.max_input_shape = {"input_data": [4, 3, 64, 64]} @@ -117,7 +117,7 @@ def generate_trt_nodes_num(attrs, dynamic_shape): clear_dynamic_shape() self.trt_param.precision = paddle_infer.PrecisionType.Float32 yield self.create_inference_config(), generate_trt_nodes_num( - attrs, False), 1e-5 + attrs, False), (1e-5, 1e-5) self.trt_param.precision = paddle_infer.PrecisionType.Half yield self.create_inference_config(), generate_trt_nodes_num( attrs, False), (1e-5, 1e-5) @@ -125,8 +125,8 @@ def generate_trt_nodes_num(attrs, dynamic_shape): # for dynamic_shape generate_dynamic_shape(attrs) self.trt_param.precision = paddle_infer.PrecisionType.Float32 - yield self.create_inference_config(), generate_trt_nodes_num(attrs, - True), 1e-5 + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, True), (1e-5, 1e-5) self.trt_param.precision = paddle_infer.PrecisionType.Half yield self.create_inference_config(), generate_trt_nodes_num( attrs, True), (1e-5, 1e-5) diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_roi_align.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_roi_align.py index 265065c7b357e..56efdb91959ce 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_roi_align.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_roi_align.py @@ -14,6 +14,7 @@ from trt_layer_auto_scan_test import TrtLayerAutoScanTest, SkipReasons from program_config import TensorConfig, ProgramConfig +import unittest import numpy as np import paddle.inference as paddle_infer from functools import partial @@ -141,6 +142,7 @@ def generate_trt_nodes_num(attrs, dynamic_shape): return 1, 3 else: return 0, 4 + return 0, 4 attrs = [ program_config.ops[i].attrs diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_skip_layernorm.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_skip_layernorm.py index 11d060847a418..9f3e7a81777c2 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_skip_layernorm.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_skip_layernorm.py @@ -18,6 +18,7 @@ import paddle.inference as paddle_infer from functools import partial from typing import Optional, List, Callable, Dict, Any, Set +import unittest class TrtConvertSkipLayernormTest(TrtLayerAutoScanTest): diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_slice.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_slice.py index 725a3085550de..17a2c9cd74c07 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_slice.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_slice.py @@ -143,7 +143,11 @@ def generate_trt_nodes_num(attrs, dynamic_shape): True), 1e-4 def test(self): - 
self.run_test() + # TODO(inference): fix. + # trt6 and trt7.1 has bug. + # trt7.2 deserialize has bug. + # self.run_test() + pass if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_softmax.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_softmax.py index e539bd9a56300..4a15a09b0f77e 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_softmax.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_softmax.py @@ -14,6 +14,7 @@ from trt_layer_auto_scan_test import TrtLayerAutoScanTest, SkipReasons from program_config import TensorConfig, ProgramConfig +import unittest import numpy as np import paddle.inference as paddle_infer from functools import partial @@ -135,7 +136,17 @@ def generate_trt_nodes_num(attrs, dynamic_shape): True), 1e-5 def add_skip_trt_case(self): - pass + def teller1(program_config, predictor_config): + if len( + program_config.inputs['softmax_input'].shape + ) == 2 and not predictor_config.tensorrt_dynamic_shape_enabled(): + return True + return False + + self.add_skip_case( + teller1, SkipReasons.TRT_NOT_IMPLEMENTED, + "The output shape has diff, but we can add shuffle layer to resolve it." + ) def test(self): self.add_skip_trt_case() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_split.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_split.py index 2db60ccc61b95..f03ed0a335eeb 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_split.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_split.py @@ -14,6 +14,7 @@ from trt_layer_auto_scan_test import TrtLayerAutoScanTest, SkipReasons from program_config import TensorConfig, ProgramConfig +import unittest import numpy as np import paddle.inference as paddle_infer from functools import partial @@ -226,6 +227,18 @@ def teller1(program_config, predictor_config): teller1, SkipReasons.TRT_NOT_SUPPORT, "INPUT AxisTensor AND SectionsTensorList NOT SUPPORT.") + def teller2(program_config, predictor_config): + if len( + program_config.inputs['split_input'].shape + ) == 2 and not predictor_config.tensorrt_dynamic_shape_enabled(): + return True + return False + + self.add_skip_case( + teller2, SkipReasons.TRT_NOT_IMPLEMENTED, + "The output shape has diff, but we can add shuffle layer to resolve it." 
+ ) + def test(self): self.add_skip_trt_case() self.run_test() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_stack.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_stack.py index df7914689beaf..93ba5da9d66d9 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_stack.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_stack.py @@ -18,6 +18,7 @@ import paddle.inference as paddle_infer from functools import partial from typing import Optional, List, Callable, Dict, Any, Set +import unittest class TrtConvertStackTest(TrtLayerAutoScanTest): diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_tile.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_tile.py index 59ab1a6c5a376..c1a5493fd328a 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_tile.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_tile.py @@ -77,10 +77,14 @@ def clear_dynamic_shape(): self.dynamic_shape.opt_input_shape = {} def generate_trt_nodes_num(attrs, dynamic_shape): - if dynamic_shape == True: - return 0, 3 + ver = paddle_infer.get_trt_compile_version() + if ver[0] * 1000 + ver[1] * 100 + ver[0] * 10 >= 7000: + if dynamic_shape == True: + return 0, 3 + else: + return 1, 2 else: - return 1, 2 + return 0, 3 attrs = [ program_config.ops[i].attrs diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_yolo_box.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_yolo_box.py index d6a0aac75c966..17955c6e007d9 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_yolo_box.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_yolo_box.py @@ -18,6 +18,7 @@ import paddle.inference as paddle_infer from functools import partial from typing import Optional, List, Callable, Dict, Any, Set +import unittest class TrtConvertYoloBoxTest(TrtLayerAutoScanTest): diff --git a/python/paddle/fluid/tests/unittests/ir/inference/trt_layer_auto_scan_test.py b/python/paddle/fluid/tests/unittests/ir/inference/trt_layer_auto_scan_test.py index 3ac185fbb04ac..edd033f28c0ed 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/trt_layer_auto_scan_test.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/trt_layer_auto_scan_test.py @@ -81,7 +81,7 @@ def __init__(self, methodName='runTest'): def create_inference_config(self, use_trt=True) -> paddle_infer.Config: config = paddle_infer.Config() - # config.disable_glog_info() + config.disable_glog_info() config.enable_use_gpu(100, 0) config.set_optim_cache_dir(self.trt_cache_dir) if use_trt: @@ -276,11 +276,11 @@ def run_test(self, quant=False): str(prog_config) + ' vs ' + self.inference_config_str( pred_config) + '\033[1;31m \nERROR INFO: {}\033[0m'.format(str(e))) - status = False + if not skip_flag: + status = False continue self.success_log('RUN ' + str(prog_config) + ' vs ' + self.inference_config_str(pred_config)) - # In the first step, we found the problem, and after the subsequent repairs, the assert assertion will be enabled - # self.assertTrue(status) + # self.assertTrue(status) From 605e7f0849eab68deac0c1972441e24824ba1b63 Mon Sep 17 00:00:00 2001 From: Zeng Jinle <32832641+sneaxiy@users.noreply.github.com> Date: Wed, 20 Oct 2021 13:30:11 +0800 Subject: [PATCH 044/116] fix pow2 decay (#36559) --- .../pow2_decay_with_linear_warmup_op.cc | 4 +-- .../pow2_decay_with_linear_warmup_op.h | 28 
++++++++----------- python/paddle/fluid/contrib/layers/nn.py | 7 ++--- .../test_pow2_decay_with_linear_warmup_op.py | 18 ++++++------ 4 files changed, 24 insertions(+), 33 deletions(-) diff --git a/paddle/fluid/operators/optimizers/pow2_decay_with_linear_warmup_op.cc b/paddle/fluid/operators/optimizers/pow2_decay_with_linear_warmup_op.cc index 12362b1bc6401..4d919c94f616b 100644 --- a/paddle/fluid/operators/optimizers/pow2_decay_with_linear_warmup_op.cc +++ b/paddle/fluid/operators/optimizers/pow2_decay_with_linear_warmup_op.cc @@ -54,8 +54,6 @@ class Pow2DecayWithLinearWarmupOpMaker AddAttr( "total_steps", "(int64_t) The total steps for changing the learning rate."); - AddAttr("start_lr", - "(float) The initial value of the learning rate."); AddAttr("base_lr", "(float) The final learning rate value after warmup."); AddAttr("end_lr", @@ -63,7 +61,7 @@ class Pow2DecayWithLinearWarmupOpMaker AddComment(R"DOC( The Pow2DecayWithLinearWarmup learning rate scheduler. -When step_num < warmup_steps, lr = (base_lr - start_lr) * step_num / warmup_steps + start_lr +When step_num < warmup_steps, lr = base_lr * step_num / warmup_steps When warmup_steps <= step_num <= total_steps, factor = 1 - (step_num - warmup_steps) / (total_steps - warmup_steps) diff --git a/paddle/fluid/operators/optimizers/pow2_decay_with_linear_warmup_op.h b/paddle/fluid/operators/optimizers/pow2_decay_with_linear_warmup_op.h index 41e07b0343e72..74cf762745077 100644 --- a/paddle/fluid/operators/optimizers/pow2_decay_with_linear_warmup_op.h +++ b/paddle/fluid/operators/optimizers/pow2_decay_with_linear_warmup_op.h @@ -28,31 +28,30 @@ struct Pow2DecayWithLinearWarmupFunctor { using RestrictPtr = U *PADDLE_RESTRICT; public: - HOSTDEVICE Pow2DecayWithLinearWarmupFunctor( - RestrictPtr lr, RestrictPtr step, size_t warmup_steps, - size_t total_steps, AttrT start_lr, AttrT base_lr, AttrT end_lr) + HOSTDEVICE Pow2DecayWithLinearWarmupFunctor(RestrictPtr lr, + RestrictPtr step, + size_t warmup_steps, + size_t total_steps, AttrT base_lr, + AttrT end_lr) : lr_(lr), step_(step), warmup_steps_(warmup_steps), total_steps_(total_steps), - start_lr_(start_lr), base_lr_(base_lr), end_lr_(end_lr) {} HOSTDEVICE void operator()(size_t) const { - size_t step = static_cast(*step_); - *step_ = static_cast(step + 1); - if (step < warmup_steps_) { - auto new_lr = - static_cast(base_lr_ - start_lr_) * step / warmup_steps_ + - start_lr_; + size_t step = static_cast(*step_) + 1; + *step_ = static_cast(step); + if (step <= warmup_steps_) { + auto new_lr = static_cast(step) / warmup_steps_ * base_lr_; *lr_ = static_cast(new_lr); } else if (step < total_steps_) { auto factor = 1 - static_cast(step - warmup_steps_) / (total_steps_ - warmup_steps_); auto new_lr = - static_cast(base_lr_ - end_lr_) * factor * factor + end_lr_; + static_cast(base_lr_ - end_lr_) * (factor * factor) + end_lr_; *lr_ = static_cast(new_lr); } else { *lr_ = static_cast(end_lr_); @@ -64,7 +63,6 @@ struct Pow2DecayWithLinearWarmupFunctor { RestrictPtr step_; size_t warmup_steps_; size_t total_steps_; - AttrT start_lr_; AttrT base_lr_; AttrT end_lr_; }; @@ -98,7 +96,6 @@ class Pow2DecayWithLinearWarmupOpKernel : public framework::OpKernel { PADDLE_ENFORCE_LE(warmup_steps, total_steps, platform::errors::InvalidArgument( "warmup_steps must not be larger than total_steps.")); - auto start_lr = ctx.Attr("start_lr"); auto base_lr = ctx.Attr("base_lr"); auto end_lr = ctx.Attr("end_lr"); @@ -106,11 +103,10 @@ class Pow2DecayWithLinearWarmupOpKernel : public framework::OpKernel { auto *step_data = 
step_out->data(); auto &dev_ctx = ctx.template device_context(); platform::ForRange for_range(dev_ctx, 1); - using AttrT = float; + using AttrT = double; Pow2DecayWithLinearWarmupFunctor functor( lr_data, step_data, warmup_steps, total_steps, - static_cast(start_lr), static_cast(base_lr), - static_cast(end_lr)); + static_cast(base_lr), static_cast(end_lr)); for_range(functor); } }; diff --git a/python/paddle/fluid/contrib/layers/nn.py b/python/paddle/fluid/contrib/layers/nn.py index 0d0addb17e9ae..cb26f05b54984 100644 --- a/python/paddle/fluid/contrib/layers/nn.py +++ b/python/paddle/fluid/contrib/layers/nn.py @@ -1936,18 +1936,18 @@ def build_program(main_program, startup_program): def pow2_decay_with_linear_warmup(warmup_steps, total_steps, - start_lr, base_lr, end_lr, dtype='float32', name=None): if paddle.fluid.in_dygraph_mode(): raise NotImplementedError( - "pow2_warmup does not support dygraph mode yet.") + "pow2_decay_with_linear_warmup does not support dygraph mode yet.") helper = LayerHelper("pow2_decay_with_linear_warmup", **locals()) lr = helper.create_global_variable(persistable=True, dtype=dtype, shape=[1]) - helper.set_variable_initializer(lr, Constant(value=start_lr)) + helper.set_variable_initializer( + lr, Constant(value=float(base_lr) / warmup_steps)) step = helper.create_global_variable( persistable=True, dtype='int64', shape=[1]) @@ -1963,7 +1963,6 @@ def pow2_decay_with_linear_warmup(warmup_steps, attrs={ "warmup_steps": warmup_steps, "total_steps": total_steps, - "start_lr": start_lr, "base_lr": base_lr, "end_lr": end_lr, }) diff --git a/python/paddle/fluid/tests/unittests/test_pow2_decay_with_linear_warmup_op.py b/python/paddle/fluid/tests/unittests/test_pow2_decay_with_linear_warmup_op.py index 641ea3eccf8d2..056db5b8590ab 100644 --- a/python/paddle/fluid/tests/unittests/test_pow2_decay_with_linear_warmup_op.py +++ b/python/paddle/fluid/tests/unittests/test_pow2_decay_with_linear_warmup_op.py @@ -19,13 +19,12 @@ import unittest -def gen_pow2_warmup_op_lr(warmup_steps, total_steps, start_lr, base_lr, end_lr, - place): +def gen_pow2_warmup_op_lr(warmup_steps, total_steps, base_lr, end_lr, place): main = paddle.static.Program() startup = paddle.static.Program() with paddle.static.program_guard(main, startup): - lr = pow2_decay_with_linear_warmup(warmup_steps, total_steps, start_lr, - base_lr, end_lr) + lr = pow2_decay_with_linear_warmup(warmup_steps, total_steps, base_lr, + end_lr) exe = paddle.static.Executor(place) with paddle.static.scope_guard(paddle.static.Scope()): exe.run(startup) @@ -35,7 +34,7 @@ def gen_pow2_warmup_op_lr(warmup_steps, total_steps, start_lr, base_lr, end_lr, class Pow2Warmup(LinearWarmup): - def __init__(self, warmup_steps, total_steps, start_lr, base_lr, end_lr): + def __init__(self, warmup_steps, total_steps, base_lr, end_lr): assert total_steps > warmup_steps lr_sch = PolynomialDecay( learning_rate=base_lr, @@ -46,13 +45,13 @@ def __init__(self, warmup_steps, total_steps, start_lr, base_lr, end_lr): super(Pow2Warmup, self).__init__( learning_rate=lr_sch, warmup_steps=warmup_steps, - start_lr=start_lr, + start_lr=0.0, end_lr=base_lr) -def gen_pow2_warmup_py_lr(warmup_steps, total_steps, start_lr, base_lr, end_lr, - place): - lr_sch = Pow2Warmup(warmup_steps, total_steps, start_lr, base_lr, end_lr) +def gen_pow2_warmup_py_lr(warmup_steps, total_steps, base_lr, end_lr, place): + lr_sch = Pow2Warmup(warmup_steps, total_steps, base_lr, end_lr) + lr_sch.step() while True: yield lr_sch() lr_sch.step() @@ -64,7 +63,6 @@ def setUp(self): self.params 
= { 'warmup_steps': 30, 'total_steps': 100, - 'start_lr': 0.01, 'base_lr': 0.02, 'end_lr': 0.001, } From 873ee4e3802bfdf10eb86b1c8ee46aa2523e18dd Mon Sep 17 00:00:00 2001 From: wuhuachaocoding <77733235+wuhuachaocoding@users.noreply.github.com> Date: Wed, 20 Oct 2021 14:28:47 +0800 Subject: [PATCH 045/116] adapt to cann5.0.3_alpha3. (#36106) --- cmake/external/ascend.cmake | 4 +++- .../operators/collective/c_embedding_op_npu.cc | 14 ++++++++++++++ paddle/fluid/operators/fill_constant_op_npu.cc | 10 ++++++++++ paddle/fluid/operators/lookup_table_v2_op_npu.cc | 3 +++ 4 files changed, 30 insertions(+), 1 deletion(-) diff --git a/cmake/external/ascend.cmake b/cmake/external/ascend.cmake index b643923cdd353..03bc7784e9288 100644 --- a/cmake/external/ascend.cmake +++ b/cmake/external/ascend.cmake @@ -92,6 +92,8 @@ macro(find_ascend_toolkit_version ascend_toolkit_version_info) file(READ ${ascend_toolkit_version_info} ASCEND_TOOLKIT_VERSION_CONTENTS) string(REGEX MATCH "version=([0-9]+\.[0-9]+\.[0-9]+\.[a-z]*[0-9]*)" ASCEND_TOOLKIT_VERSION "${ASCEND_TOOLKIT_VERSION_CONTENTS}") string(REGEX REPLACE "version=([0-9]+\.[0-9]+\.[0-9]+\.[a-z]*[0-9]*)" "\\1" ASCEND_TOOLKIT_VERSION "${ASCEND_TOOLKIT_VERSION}") + string(REGEX REPLACE "[a-z|\.]" "" CANN_VERSION ${ASCEND_TOOLKIT_VERSION}) + add_definitions("-DCANN_VERSION_CODE=${CANN_VERSION}") if(NOT ASCEND_TOOLKIT_VERSION) set(ASCEND_TOOLKIT_VERSION "???") else() @@ -118,4 +120,4 @@ endif() find_ascend_toolkit_version(${ASCEND_TOOLKIT_DIR}/ascend_toolkit_install.info) find_ascend_driver_version(${ASCEND_DIR}/driver/version.info) -endif() \ No newline at end of file +endif() diff --git a/paddle/fluid/operators/collective/c_embedding_op_npu.cc b/paddle/fluid/operators/collective/c_embedding_op_npu.cc index c2d607223868a..021e5790afe57 100644 --- a/paddle/fluid/operators/collective/c_embedding_op_npu.cc +++ b/paddle/fluid/operators/collective/c_embedding_op_npu.cc @@ -68,10 +68,21 @@ void shard_index(const Tensor &table_t, const Tensor &ids_t, int64_t start_idx, ignore_tensor.Resize(ids_t.dims()); NpuOpRunner sub_runner; +#if (CANN_VERSION_CODE >= 503003) + Tensor factor_tensor(ids_t.type()); + factor_tensor.mutable_data({1}, context.GetPlace()); + TensorFromVector(std::vector{static_cast(start_idx)}, + context.device_context(), &factor_tensor); + sub_runner.SetType("Sub") + .AddInput(ids_t) + .AddInput(factor_tensor) + .AddOutput(id_t); +#else sub_runner.SetType("Sub") .AddInput(ids_t) .AddInput(std::vector{static_cast(start_idx)}) .AddOutput(id_t); +#endif sub_runner.Run(); NpuOpRunner lessequal1_runner; @@ -137,6 +148,9 @@ void NPUGetIdsEmbedding(const framework::ExecutionContext &context) { .AddInput(table_t_pad) .AddInput(ids_t_local) .AddInput(std::vector{0}) +#if (CANN_VERSION_CODE >= 503003) + .AddAttrs({{"batch_dims", 0}}) +#endif .AddOutput(*output_t); runner.Run(); } diff --git a/paddle/fluid/operators/fill_constant_op_npu.cc b/paddle/fluid/operators/fill_constant_op_npu.cc index ae0148a9bf513..16a2433f5cad6 100644 --- a/paddle/fluid/operators/fill_constant_op_npu.cc +++ b/paddle/fluid/operators/fill_constant_op_npu.cc @@ -66,11 +66,21 @@ class FillConstantNPUKernel : public framework::OpKernel { out_var->mutable_data(shape, ctx.GetPlace()); NpuOpRunner runner; +#if (CANN_VERSION_CODE >= 503003) + runner.SetType("FillD") + .AddInput(tensor_value) + .AddOutput(*out_var) + .AddAttrs( + {{ "dims", + framework::vectorize(shape) }}) + .Run(stream); +#else runner.SetType("Fill") .AddInput(framework::vectorize(shape)) .AddInput(tensor_value) 
.AddOutput(*out_var) .Run(stream); +#endif } }; } // namespace operators diff --git a/paddle/fluid/operators/lookup_table_v2_op_npu.cc b/paddle/fluid/operators/lookup_table_v2_op_npu.cc index 387cd92b69f92..b75ae8a65881a 100644 --- a/paddle/fluid/operators/lookup_table_v2_op_npu.cc +++ b/paddle/fluid/operators/lookup_table_v2_op_npu.cc @@ -40,6 +40,9 @@ class LookupTableV2NPUKernel : public framework::OpKernel { .AddInput(*table_t) .AddInput(*ids_t) .AddInput(std::vector{0}) +#if (CANN_VERSION_CODE >= 503003) + .AddAttrs({{"batch_dims", 0}}) +#endif .AddOutput(*output_t); runner.Run(); } From 3f2d6a3f21fee7a95c580d22ffcd708200fd8306 Mon Sep 17 00:00:00 2001 From: Steffy-zxf <48793257+Steffy-zxf@users.noreply.github.com> Date: Wed, 20 Oct 2021 14:55:14 +0800 Subject: [PATCH 046/116] Add FasterTokenizer Operator (#34491) Add Tokenizer related functionalities for Transformer model in order that the process of training and predicting is consistent. * support the text string as an input Tensor * support the "VOCAB"unordered_map as an input Tensor to lookup tokens * Tokenizer used for BERT. This tokenizer applies an end-to-end, text string to wordpiece tokenization. * It first applies basic tokenization, followed by wordpiece tokenization. --- cmake/external/utf8proc.cmake | 51 + cmake/inference_lib.cmake | 5 + cmake/third_party.cmake | 4 + paddle/fluid/framework/CMakeLists.txt | 2 + paddle/fluid/framework/executor.cc | 8 +- paddle/fluid/framework/executor_gc_helper.cc | 1 + paddle/fluid/framework/feed_fetch_method.cc | 20 +- paddle/fluid/framework/feed_fetch_method.h | 4 + paddle/fluid/framework/feed_fetch_type.h | 12 +- paddle/fluid/framework/framework.proto | 9 + paddle/fluid/framework/operator.cc | 4 + paddle/fluid/framework/string_array.cc | 104 ++ paddle/fluid/framework/string_array.h | 48 + paddle/fluid/framework/tensor_util.cc | 3 +- paddle/fluid/framework/tensor_util.h | 14 + paddle/fluid/framework/var_desc.cc | 8 + paddle/fluid/framework/var_type_traits.h | 13 +- paddle/fluid/framework/variable_helper.cc | 5 + paddle/fluid/imperative/variable_wrapper.h | 10 + paddle/fluid/inference/api/CMakeLists.txt | 2 +- .../inference/api/demo_ci/CMakeLists.txt | 7 +- .../inference/api/details/zero_copy_tensor.cc | 57 +- .../api/details/zero_copy_tensor_dummy.cc | 5 +- .../api/details/zero_copy_tensor_test.cc | 3 +- paddle/fluid/inference/api/paddle_api.h | 8 + paddle/fluid/inference/api/paddle_tensor.h | 22 + paddle/fluid/inference/io.cc | 10 +- paddle/fluid/operators/CMakeLists.txt | 7 +- paddle/fluid/operators/controlflow/feed_op.cc | 54 +- .../fluid/operators/controlflow/fetch_op.cc | 12 +- paddle/fluid/operators/load_combine_op.h | 73 +- paddle/fluid/operators/save_combine_op.h | 60 +- paddle/fluid/operators/string/CMakeLists.txt | 6 + .../operators/string/faster_tokenizer_op.cc | 524 +++++++ .../operators/string/faster_tokenizer_op.h | 196 +++ .../operators/string/unity_build_rule.cmake | 8 + paddle/fluid/pybind/imperative.cc | 6 + paddle/fluid/pybind/inference_api.cc | 37 +- paddle/fluid/pybind/op_function_generator.cc | 1 + paddle/fluid/pybind/protobuf.cc | 5 +- paddle/fluid/pybind/pybind.cc | 47 +- python/paddle/fluid/dygraph/jit.py | 17 +- python/paddle/fluid/dygraph/layers.py | 23 +- python/paddle/fluid/dygraph/math_op_patch.py | 7 +- .../fluid/dygraph/varbase_patch_methods.py | 40 +- python/paddle/fluid/executor.py | 8 +- python/paddle/fluid/framework.py | 4 + python/paddle/fluid/inference/wrapper.py | 10 +- .../unittests/test_faster_tokenizer_op.py | 393 ++++++ 
.../tests/unittests/tokenizer/__init__.py | 13 + .../unittests/tokenizer/bert_tokenizer.py | 517 +++++++ .../unittests/tokenizer/tokenizer_utils.py | 1244 +++++++++++++++++ python/paddle/framework/io.py | 10 +- 53 files changed, 3604 insertions(+), 157 deletions(-) create mode 100644 cmake/external/utf8proc.cmake create mode 100755 paddle/fluid/framework/string_array.cc create mode 100755 paddle/fluid/framework/string_array.h create mode 100644 paddle/fluid/operators/string/CMakeLists.txt create mode 100644 paddle/fluid/operators/string/faster_tokenizer_op.cc create mode 100755 paddle/fluid/operators/string/faster_tokenizer_op.h create mode 100644 paddle/fluid/operators/string/unity_build_rule.cmake create mode 100755 python/paddle/fluid/tests/unittests/test_faster_tokenizer_op.py create mode 100644 python/paddle/fluid/tests/unittests/tokenizer/__init__.py create mode 100755 python/paddle/fluid/tests/unittests/tokenizer/bert_tokenizer.py create mode 100644 python/paddle/fluid/tests/unittests/tokenizer/tokenizer_utils.py diff --git a/cmake/external/utf8proc.cmake b/cmake/external/utf8proc.cmake new file mode 100644 index 0000000000000..a5de5c15c3b51 --- /dev/null +++ b/cmake/external/utf8proc.cmake @@ -0,0 +1,51 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +INCLUDE(ExternalProject) + +SET(UTF8PROC_PREFIX_DIR ${THIRD_PARTY_PATH}/utf8proc) +SET(UTF8PROC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/utf8proc) +# As we add extra features for utf8proc, we use the non-official repo +SET(UTF8PROC_REPOSITORY ${GIT_URL}/JuliaStrings/utf8proc.git) +SET(UTF8PROC_TAG v2.6.1) + +IF(WIN32) + SET(UTF8PROC_LIBRARIES "${UTF8PROC_INSTALL_DIR}/lib/utf8proc_static.lib") + add_definitions(-DUTF8PROC_STATIC) +ELSE(WIN32) + SET(UTF8PROC_LIBRARIES "${UTF8PROC_INSTALL_DIR}/lib/libutf8proc.a") +ENDIF(WIN32) + +INCLUDE_DIRECTORIES(${UTF8PROC_INSTALL_DIR}/include) + +ExternalProject_Add( + extern_utf8proc + ${EXTERNAL_PROJECT_LOG_ARGS} + ${SHALLOW_CLONE} + GIT_REPOSITORY ${UTF8PROC_REPOSITORY} + GIT_TAG ${UTF8PROC_TAG} + PREFIX ${UTF8PROC_PREFIX_DIR} + UPDATE_COMMAND "" + CMAKE_ARGS -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} + -DBUILD_SHARED=ON + -DBUILD_STATIC=ON + -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} + -DCMAKE_INSTALL_PREFIX:PATH=${UTF8PROC_INSTALL_DIR} + -DCMAKE_BUILD_TYPE:STRING=${CMAKE_BUILD_TYPE} + BUILD_BYPRODUCTS ${UTF8PROC_LIBRARIES} +) + +ADD_LIBRARY(utf8proc STATIC IMPORTED GLOBAL) +SET_PROPERTY(TARGET utf8proc PROPERTY IMPORTED_LOCATION ${UTF8PROC_LIBRARIES}) +ADD_DEPENDENCIES(utf8proc extern_utf8proc) diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake index 5ffbf15c960a3..dfd93f49e7340 100644 --- a/cmake/inference_lib.cmake +++ b/cmake/inference_lib.cmake @@ -124,6 +124,11 @@ function(copy_part_of_thrid_party TARGET DST) SRCS ${GLOG_INCLUDE_DIR} ${GLOG_LIBRARIES} DSTS ${dst_dir} ${dst_dir}/lib) + set(dst_dir "${DST}/third_party/install/utf8proc") + copy(${TARGET} + SRCS ${UTF8PROC_INSTALL_DIR}/include ${UTF8PROC_LIBRARIES} + DSTS ${dst_dir} ${dst_dir}/lib) + if (WITH_CRYPTO) set(dst_dir "${DST}/third_party/install/cryptopp") copy(${TARGET} diff --git a/cmake/third_party.cmake b/cmake/third_party.cmake index b3260ba27b072..d45b5e07bb8f3 100644 --- a/cmake/third_party.cmake +++ b/cmake/third_party.cmake @@ -210,6 +210,10 @@ include(external/threadpool)# download threadpool include(external/dlpack) # download dlpack include(external/xxhash) # download, build, install xxhash include(external/warpctc) # download, build, install warpctc +include(external/utf8proc) # download, build, install utf8proc + +list(APPEND third_party_deps extern_eigen3 extern_gflags extern_glog extern_boost extern_xxhash) +list(APPEND third_party_deps extern_zlib extern_dlpack extern_warpctc extern_threadpool extern_utf8proc) include(external/lapack) # download, build, install lapack list(APPEND third_party_deps extern_eigen3 extern_gflags extern_glog extern_boost extern_xxhash) diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index 6e57b829ade4e..4dfcf0985b85e 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -51,6 +51,8 @@ proto_library(data_feed_proto SRCS data_feed.proto) proto_library(trainer_desc_proto SRCS trainer_desc.proto DEPS framework_proto data_feed_proto) +cc_library(string_array SRCS string_array.cc DEPS utf8proc) + cc_library(ddim SRCS ddim.cc DEPS eigen3 boost enforce) cc_test(ddim_test SRCS ddim_test.cc DEPS ddim) if(WITH_GPU) diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc index de007c128d754..5f681ec7ea241 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -102,14 +102,18 @@ void Executor::CreateVariables(const ProgramDesc& pdesc, Scope* scope, if (var->Persistable()) { auto* ptr = 
const_cast(ancestor_scope)->Var(var->Name()); + + VLOG(3) << "Initialize Variable " << var->Name(); InitializeVariable(ptr, var->GetType()); VLOG(3) << "Create Variable " << var->Name() - << " global, which pointer is " << ptr; + << " global, which pointer is " << ptr << " type is " + << static_cast(var->GetType()); } else { auto* ptr = scope->Var(var->Name()); InitializeVariable(ptr, var->GetType()); VLOG(3) << "Create Variable " << var->Name() - << " locally, which pointer is " << ptr; + << " locally, which pointer is " << ptr << "Variable Type " + << static_cast(var->GetType()); } } } else { diff --git a/paddle/fluid/framework/executor_gc_helper.cc b/paddle/fluid/framework/executor_gc_helper.cc index 43eb1ce8c77f8..8c64d65ff4be6 100644 --- a/paddle/fluid/framework/executor_gc_helper.cc +++ b/paddle/fluid/framework/executor_gc_helper.cc @@ -125,6 +125,7 @@ void DeleteUnusedTensors(const Scope &scope, for (auto &t : *lod_tensor_arr) { garbages.emplace_back(t.MoveMemoryHolder()); } + } else if (var->IsType()) { } else { PADDLE_THROW(platform::errors::Unimplemented( "Type %s of variable %s is not supported eager deletion.", diff --git a/paddle/fluid/framework/feed_fetch_method.cc b/paddle/fluid/framework/feed_fetch_method.cc index 3bd85b2b24b97..2eac65c90c02f 100644 --- a/paddle/fluid/framework/feed_fetch_method.cc +++ b/paddle/fluid/framework/feed_fetch_method.cc @@ -16,6 +16,7 @@ limitations under the License. */ #include +#include #include "glog/logging.h" namespace paddle { @@ -35,9 +36,24 @@ void SetFeedVariable(Scope* scope, const LoDTensor& input, feed_inputs.resize(index + 1); } // shared data with input tensor - feed_inputs[index].ShareDataWith(input); + auto& val = BOOST_GET(LoDTensor, feed_inputs[index]); + val.ShareDataWith(input); // set lod - feed_inputs[index].set_lod(input.lod()); + val.set_lod(input.lod()); +} + +void SetFeedVariable(Scope* scope, const Strings& input, + const std::string& var_name, size_t index) { + // If var_name Variable is not found in GlobalScope, a new variable will + // be created. + VLOG(3) << "SetFeedStringVariable name=" << var_name << " index=" << index; + Variable* g_feed_value = scope->Var(var_name); + auto& feed_inputs = *(g_feed_value->GetMutable()); + if (index >= feed_inputs.size()) { + feed_inputs.resize(index + 1); + } + // shared data with input tensor + feed_inputs[index] = input; } FetchType& GetFetchVariable(const Scope& scope, const std::string& var_name, diff --git a/paddle/fluid/framework/feed_fetch_method.h b/paddle/fluid/framework/feed_fetch_method.h index a52ef517c8b73..4c2f5b9796a22 100644 --- a/paddle/fluid/framework/feed_fetch_method.h +++ b/paddle/fluid/framework/feed_fetch_method.h @@ -18,6 +18,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/feed_fetch_type.h" #include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/string_array.h" namespace paddle { namespace framework { @@ -28,6 +29,9 @@ class Scope; void SetFeedVariable(Scope* scope, const LoDTensor& input, const std::string& var_name, size_t index); +void SetFeedVariable(Scope* scope, const Strings& input, + const std::string& var_name, size_t index); + FetchType& GetFetchVariable(const Scope& scope, const std::string& var_name, size_t index); diff --git a/paddle/fluid/framework/feed_fetch_type.h b/paddle/fluid/framework/feed_fetch_type.h index 1996327fe82bc..12c111e58f58a 100644 --- a/paddle/fluid/framework/feed_fetch_type.h +++ b/paddle/fluid/framework/feed_fetch_type.h @@ -13,14 +13,17 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once + #include + #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor_array.h" +#include "paddle/fluid/framework/string_array.h" #include "paddle/fluid/platform/variant.h" namespace paddle { namespace framework { -using FeedType = LoDTensor; +using FeedType = boost::variant; using FeedList = std::vector; using FetchType = boost::variant; @@ -43,6 +46,13 @@ inline bool data_is_lod_tensor_array(const FetchType &data) { return false; } +inline bool data_is_string_tensor(const FeedType &data) { + if (data.type() == typeid(Strings)) { + return true; + } + return false; +} + static const char kFeedOpType[] = "feed"; static const char kFetchOpType[] = "fetch"; diff --git a/paddle/fluid/framework/framework.proto b/paddle/fluid/framework/framework.proto index eb72d9e1420dc..300d5f6e8fad1 100644 --- a/paddle/fluid/framework/framework.proto +++ b/paddle/fluid/framework/framework.proto @@ -147,6 +147,11 @@ message VarType { // in operators like nccl_op RAW = 17; TUPLE = 18; + + STRING = 25; + STRINGS = 26; + VOCAB = 27; + FEED_LIST = 28; } required Type type = 1; @@ -175,6 +180,10 @@ message VarType { message Tuple { repeated Type element_type = 1; } optional Tuple tuple = 7; + + optional TensorDesc string = 8; + optional TensorDesc strings = 9; + optional TensorDesc vocab = 10; } message VarDesc { diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 2a543d48791a3..0cd17cdb10d55 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -76,6 +76,8 @@ static DDim GetDimsDebug(const Scope& scope, const std::string& name, } else { return var->Get().GetCompleteDims(); } + } else if (var->IsType()) { + return DDim({static_cast(var->Get().size())}); } else { return DDim({-1}); } @@ -106,6 +108,8 @@ static std::string GetDtype(const Scope& scope, const std::string& name) { } else { return DataTypeToString(tensor.type()); } + } else if (var->IsType()) { + return "strings"; } else { return ""; } diff --git a/paddle/fluid/framework/string_array.cc b/paddle/fluid/framework/string_array.cc new file mode 100755 index 0000000000000..3071e6bf4cff3 --- /dev/null +++ b/paddle/fluid/framework/string_array.cc @@ -0,0 +1,104 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include + +#include + +#include "glog/logging.h" +#include "paddle/fluid/framework/string_array.h" + +namespace paddle { +namespace framework { + +std::wstring_convert> kConverter; + +// Convert the std::string type to the std::wstring type. +bool ConvertStrToWstr(const std::string& src, std::wstring* res) { + try { + *res = kConverter.from_bytes(src); + } catch (std::range_error& e) { + VLOG(3) << "The string " << src << " was converted to unicode failedly! "; + return false; + } + return true; +} + +// Convert the std::wstring type to the std::string type. +void ConvertWstrToStr(const std::wstring& src, std::string* res) { + *res = kConverter.to_bytes(src); +} + +// Normalization Form Canonical Decomposition. +void NFD(const std::string& s, std::string* ret) { + *ret = ""; + char* result = reinterpret_cast( + utf8proc_NFD(reinterpret_cast(s.c_str()))); + if (result) { + *ret = std::move(std::string(result)); + free(result); + } +} + +// Write the data which is type of +// std::unordered_map to ostream. +void StringMapToStream(std::ostream& os, + const std::unordered_map& data) { + { + // firstly write the data size. + size_t t = data.size(); + os.write(reinterpret_cast(&t), sizeof(t)); + } + { + // then write the data + for (auto it = data.begin(); it != data.end(); ++it) { + std::string token = it->first; + int32_t token_id = it->second; + // write the token + size_t length = token.size(); + os.write(reinterpret_cast(&length), sizeof(length)); + os.write(token.c_str(), length); + // write the token_id + os.write(reinterpret_cast(&token_id), sizeof(token_id)); + } + } +} + +// Read the data which is type of +// std::unordered_map from istream. +void StringMapFromStream(std::istream& is, + std::unordered_map* data) { + // first read the map size + size_t map_size; + is.read(reinterpret_cast(&map_size), sizeof(map_size)); + data->reserve(map_size); + // then read the data + for (size_t i = 0; i < map_size; ++i) { + // read the token + size_t token_length; + is.read(reinterpret_cast(&token_length), sizeof(token_length)); + char* tmp = new char[token_length]; + is.read(tmp, token_length); + std::string token(tmp, tmp + token_length); + delete[] tmp; + // read the token_id + int32_t token_id; + is.read(reinterpret_cast(&token_id), sizeof(token_id)); + + data->emplace(token, token_id); + } +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/string_array.h b/paddle/fluid/framework/string_array.h new file mode 100755 index 0000000000000..b874fbac4c9e7 --- /dev/null +++ b/paddle/fluid/framework/string_array.h @@ -0,0 +1,48 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include +#include +#include +#include + +namespace paddle { +namespace framework { + +using String = std::string; +using Strings = std::vector; +using Vocab = std::unordered_map; + +// Convert the std::string type to the std::string type. +bool ConvertStrToWstr(const std::string& src, std::wstring* res); +// Convert the std::wstring type to the std::string type. +void ConvertWstrToStr(const std::wstring& src, std::string* res); +// Normalization Form Canonical Decomposition. +void NFD(const std::string& s, std::string* ret); + +// Write the data which is type of +// std::unordered_map to ostream. +void StringMapToStream(std::ostream& os, + const std::unordered_map& data); + +// Read the data which is type of +// std::unordered_map from istream. +void StringMapFromStream(std::istream& is, + std::unordered_map* data); +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc index ee30a82aff6ef..1c43219330bfe 100644 --- a/paddle/fluid/framework/tensor_util.cc +++ b/paddle/fluid/framework/tensor_util.cc @@ -12,8 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/framework/tensor_util.h" - #include #include #include @@ -22,6 +20,7 @@ limitations under the License. */ #include #include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/platform/complex.h" #include "paddle/fluid/platform/profiler.h" #ifdef PADDLE_WITH_MKLDNN diff --git a/paddle/fluid/framework/tensor_util.h b/paddle/fluid/framework/tensor_util.h index f4bbbaa2e70cf..73829898be961 100644 --- a/paddle/fluid/framework/tensor_util.h +++ b/paddle/fluid/framework/tensor_util.h @@ -13,11 +13,17 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once +#include +#include +#include +#include +#include #include #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/dlpack_tensor.h" #include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/string_array.h" #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/memory/allocation/allocator_facade.h" #ifdef PADDLE_WITH_ASCEND_CL @@ -48,6 +54,14 @@ class PrintOptions { PrintOptions() {} }; +void TensorToStream(std::ostream& os, const Tensor& tensor, + const platform::DeviceContext& dev_ctx); +void TensorFromStream(std::istream& is, Tensor* tensor, + const platform::DeviceContext& dev_ctx); +void TensorFromStream(std::istream& is, Tensor* tensor, + const platform::DeviceContext& dev_ctx, + const size_t& seek, const std::vector& shape); + // NOTE(zcd): Because TensorCopy is an async operation, when the src_place // and dst_place are two different GPU, to ensure that the operation can // be carried out correctly, there is a src_ctx wait operation in TensorCopy. 
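The StringMapToStream/StringMapFromStream pair added in string_array.cc above fixes the serialized vocab layout: a size_t entry count, then for each entry a size_t token length, the raw token bytes, and an int32_t token id. A minimal round-trip sketch of that format, assuming only the declarations from the new string_array.h; the std::stringstream harness and the sample entries are illustrative, not part of the patch:

#include <cstdint>
#include <sstream>
#include <string>
#include <unordered_map>

#include "paddle/fluid/framework/string_array.h"

int main() {
  std::unordered_map<std::string, std::int32_t> vocab = {
      {"[PAD]", 0}, {"[UNK]", 1}, {"hello", 2}};
  std::stringstream ss;
  // Writes the entry count, then (token length, token bytes, token id) per entry.
  paddle::framework::StringMapToStream(ss, vocab);

  std::unordered_map<std::string, std::int32_t> restored;
  // Reads the same layout back; restored ends up equal to vocab.
  paddle::framework::StringMapFromStream(ss, &restored);
  return 0;
}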
diff --git a/paddle/fluid/framework/var_desc.cc b/paddle/fluid/framework/var_desc.cc index c3bdd6ae7f135..41fe9fbbc0396 100644 --- a/paddle/fluid/framework/var_desc.cc +++ b/paddle/fluid/framework/var_desc.cc @@ -209,6 +209,10 @@ const proto::VarType::TensorDesc &VarDesc::tensor_desc() const { return desc_.type().lod_tensor().tensor(); case proto::VarType::LOD_TENSOR_ARRAY: return desc_.type().tensor_array().tensor(); + case proto::VarType::STRINGS: + return desc_.type().strings(); + case proto::VarType::VOCAB: + return desc_.type().vocab(); default: PADDLE_THROW(platform::errors::Unavailable( "Getting 'tensor_desc' is not supported by the %s type variable.", @@ -249,6 +253,10 @@ proto::VarType::TensorDesc *VarDesc::mutable_tensor_desc() { return desc_.mutable_type()->mutable_lod_tensor()->mutable_tensor(); case proto::VarType::LOD_TENSOR_ARRAY: return desc_.mutable_type()->mutable_tensor_array()->mutable_tensor(); + case proto::VarType::STRINGS: + return desc_.mutable_type()->mutable_strings(); + case proto::VarType::VOCAB: + return desc_.mutable_type()->mutable_vocab(); default: PADDLE_THROW( platform::errors::Unavailable("Getting 'mutable_tensor_desc' is not " diff --git a/paddle/fluid/framework/var_type_traits.h b/paddle/fluid/framework/var_type_traits.h index 473df85aa0421..c8c3cf364e0fc 100644 --- a/paddle/fluid/framework/var_type_traits.h +++ b/paddle/fluid/framework/var_type_traits.h @@ -18,10 +18,12 @@ #include #include #include +#include #include #include "paddle/fluid/framework/feed_fetch_type.h" #include "paddle/fluid/framework/lod_tensor_array.h" +#include "paddle/fluid/framework/string_array.h" #include "paddle/fluid/platform/place.h" #ifdef PADDLE_WITH_CUDA #include @@ -162,8 +164,8 @@ struct VarTypeRegistryImpl { // Paddle would generate unique Ids for each registered variable types. 
using VarTypeRegistry = detail::VarTypeRegistryImpl< Tensor, LoDTensor, SelectedRows, std::vector, LoDRankTable, - LoDTensorArray, platform::PlaceList, ReaderHolder, std::string, Scope *, - operators::reader::LoDTensorBlockingQueueHolder, FetchList, + Strings, LoDTensorArray, platform::PlaceList, ReaderHolder, String, Scope *, + operators::reader::LoDTensorBlockingQueueHolder, FetchList, FeedList, operators::reader::OrderedMultiDeviceLoDTensorBlockingQueueHolder, #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) @@ -177,8 +179,7 @@ using VarTypeRegistry = detail::VarTypeRegistryImpl< #if defined(PADDLE_WITH_XPU_BKCL) BKCLUniqueId, platform::BKCLCommunicator, #endif - int, float>; - + int, float, Vocab>; template struct VarTypeTrait { static_assert(VarTypeRegistry::IsRegistered(), "Must be registered type"); @@ -208,9 +209,13 @@ REG_PROTO_VAR_TYPE_TRAIT(LoDRankTable, proto::VarType::LOD_RANK_TABLE); REG_PROTO_VAR_TYPE_TRAIT(LoDTensorArray, proto::VarType::LOD_TENSOR_ARRAY); REG_PROTO_VAR_TYPE_TRAIT(platform::PlaceList, proto::VarType::PLACE_LIST); REG_PROTO_VAR_TYPE_TRAIT(ReaderHolder, proto::VarType::READER); +REG_PROTO_VAR_TYPE_TRAIT(FeedList, proto::VarType::FEED_LIST); REG_PROTO_VAR_TYPE_TRAIT(FetchList, proto::VarType::FETCH_LIST); REG_PROTO_VAR_TYPE_TRAIT(int, proto::VarType::INT32); REG_PROTO_VAR_TYPE_TRAIT(float, proto::VarType::FP32); +REG_PROTO_VAR_TYPE_TRAIT(Vocab, proto::VarType::VOCAB); +REG_PROTO_VAR_TYPE_TRAIT(String, proto::VarType::STRING); +REG_PROTO_VAR_TYPE_TRAIT(Strings, proto::VarType::STRINGS); /** End of variable type registration */ diff --git a/paddle/fluid/framework/variable_helper.cc b/paddle/fluid/framework/variable_helper.cc index bdcdd4e64e331..37ec5d7bc83bd 100644 --- a/paddle/fluid/framework/variable_helper.cc +++ b/paddle/fluid/framework/variable_helper.cc @@ -21,6 +21,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/reader.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/framework/string_array.h" #include "paddle/fluid/platform/place.h" namespace paddle { @@ -41,6 +42,10 @@ void InitializeVariable(Variable *var, proto::VarType::Type var_type) { var->GetMutable(); } else if (var_type == proto::VarType::LOD_TENSOR_ARRAY) { var->GetMutable(); + } else if (var_type == proto::VarType::STRINGS) { + var->GetMutable(); + } else if (var_type == proto::VarType::VOCAB) { + var->GetMutable(); } else if (var_type == proto::VarType::PLACE_LIST) { var->GetMutable(); } else if (var_type == proto::VarType::READER) { diff --git a/paddle/fluid/imperative/variable_wrapper.h b/paddle/fluid/imperative/variable_wrapper.h index 758e8e62718e7..9fbbe7d06f8ad 100644 --- a/paddle/fluid/imperative/variable_wrapper.h +++ b/paddle/fluid/imperative/variable_wrapper.h @@ -20,6 +20,7 @@ #include #include "paddle/fluid/framework/op_kernel_type.h" +#include "paddle/fluid/framework/string_array.h" #include "paddle/fluid/framework/variable.h" #include "paddle/fluid/imperative/hooks.h" #include "paddle/fluid/imperative/op_base.h" @@ -153,6 +154,15 @@ class VariableWrapper { tensor = &(var_.Get()); } else if (type_ == framework::proto::VarType::SELECTED_ROWS) { tensor = &(var_.Get().value()); + } else if (type_ == framework::proto::VarType::VOCAB) { + const framework::Vocab* data = nullptr; + data = &(var_.Get()); + if (data && data->size() != 0) { + VLOG(6) << "The tensor of variable " << name_ + << " is not initialized"; + return data_type_; + } + return framework::proto::VarType::VOCAB; } else { VLOG(6) << "Variable " << name_ << " is not initialized"; return data_type_; diff --git a/paddle/fluid/inference/api/CMakeLists.txt b/paddle/fluid/inference/api/CMakeLists.txt index bbec3eab1cadf..53b92c1336302 100755 --- a/paddle/fluid/inference/api/CMakeLists.txt +++ b/paddle/fluid/inference/api/CMakeLists.txt @@ -26,7 +26,7 @@ if(WITH_MKLDNN) set(mkldnn_quantizer_cfg ${mkldnn_quantizer_cfg} PARENT_SCOPE) endif() -cc_library(analysis_config SRCS analysis_config.cc DEPS ${mkldnn_quantizer_cfg} lod_tensor paddle_pass_builder table_printer) +cc_library(analysis_config SRCS analysis_config.cc DEPS ${mkldnn_quantizer_cfg} lod_tensor paddle_pass_builder table_printer utf8proc) cc_library(paddle_infer_contrib SRCS paddle_infer_contrib.cc DEPS zero_copy_tensor) cc_library(paddle_pass_builder SRCS paddle_pass_builder.cc) diff --git a/paddle/fluid/inference/api/demo_ci/CMakeLists.txt b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt index 47abe3298aa7c..1fdc5cd730e53 100644 --- a/paddle/fluid/inference/api/demo_ci/CMakeLists.txt +++ b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt @@ -34,12 +34,14 @@ include_directories("${PADDLE_LIB}/") set(PADDLE_LIB_THIRD_PARTY_PATH "${PADDLE_LIB}/third_party/install/") include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}protobuf/include") include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}glog/include") +include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}utf8proc/include") include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}gflags/include") include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}xxhash/include") include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}cryptopp/include") link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}protobuf/lib") link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}glog/lib") +link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}utf8proc/lib") 
link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}gflags/lib") link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}xxhash/lib") link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}cryptopp/lib") @@ -151,12 +153,13 @@ if (NOT WIN32) set(EXTERNAL_LIB "-lrt -ldl -lpthread") set(DEPS ${DEPS} ${MATH_LIB} ${MKLDNN_LIB} - glog gflags protobuf xxhash cryptopp + glog gflags protobuf xxhash cryptopp utf8proc ${EXTERNAL_LIB}) else() set(DEPS ${DEPS} ${MATH_LIB} ${MKLDNN_LIB} - glog gflags_static libprotobuf xxhash cryptopp-static ${EXTERNAL_LIB}) + glog gflags_static libprotobuf xxhash cryptopp-static utf8proc_static + ${EXTERNAL_LIB}) set(DEPS ${DEPS} shlwapi.lib) endif(NOT WIN32) diff --git a/paddle/fluid/inference/api/details/zero_copy_tensor.cc b/paddle/fluid/inference/api/details/zero_copy_tensor.cc index a9c6ef13177c2..bb537f0c65285 100644 --- a/paddle/fluid/inference/api/details/zero_copy_tensor.cc +++ b/paddle/fluid/inference/api/details/zero_copy_tensor.cc @@ -43,15 +43,33 @@ void Tensor::Reshape(const std::vector &shape) { tensor->Resize(paddle::framework::make_ddim(shape)); } -#define EAGER_GET_TENSOR \ - if (!tensor_) { \ - tensor_ = FindTensor(); \ - } \ - auto *tensor = static_cast(tensor_); +void Tensor::ReshapeStrings(const size_t &shape) { + PADDLE_ENFORCE_EQ( + name_.empty(), false, + paddle::platform::errors::PreconditionNotMet( + "Need to SetName first, so that the corresponding tensor can " + "be retrieved.")); + PADDLE_ENFORCE_EQ(input_or_output_, true, + paddle::platform::errors::PermissionDenied( + "Can't reshape the output tensor, it is readonly")); + auto *scope = static_cast(scope_); + auto *var = scope->FindVar(name_); + PADDLE_ENFORCE_NOT_NULL( + var, paddle::platform::errors::PreconditionNotMet( + "No tensor called [%s] in the runtime scope", name_)); + paddle_infer::Strings *tensor = var->GetMutable(); + tensor->resize(shape); +} + +#define EAGER_GET_TENSOR(tensor_type) \ + if (!tensor_) { \ + tensor_ = FindTensor(); \ + } \ + auto *tensor = static_cast(tensor_); template T *Tensor::mutable_data(PlaceType place) { - EAGER_GET_TENSOR; + EAGER_GET_TENSOR(paddle::framework::LoDTensor); PADDLE_ENFORCE_GT( tensor->numel(), 0, paddle::platform::errors::PreconditionNotMet( @@ -83,7 +101,7 @@ T *Tensor::mutable_data(PlaceType place) { template T *Tensor::data(PlaceType *place, int *size) const { - EAGER_GET_TENSOR; + EAGER_GET_TENSOR(paddle::framework::LoDTensor); auto *res = tensor->data(); if (paddle::platform::is_cpu_place(tensor->place())) { @@ -103,7 +121,7 @@ T *Tensor::data(PlaceType *place, int *size) const { } DataType Tensor::type() const { - EAGER_GET_TENSOR; + EAGER_GET_TENSOR(paddle::framework::LoDTensor); auto type = tensor->type(); if (type == paddle::framework::proto::VarType::FP32) { return DataType::FLOAT32; @@ -125,7 +143,7 @@ PlaceType Tensor::place() const { return place_; } template void Tensor::CopyFromCpu(const T *data) { - EAGER_GET_TENSOR; + EAGER_GET_TENSOR(paddle::framework::LoDTensor); PADDLE_ENFORCE_GE(tensor->numel(), 0, paddle::platform::errors::PreconditionNotMet( "You should call Tensor::Reshape(const " @@ -186,10 +204,20 @@ void Tensor::CopyFromCpu(const T *data) { } } +void Tensor::CopyStringsFromCpu(const paddle_infer::Strings *data) { + EAGER_GET_TENSOR(paddle_infer::Strings); + PADDLE_ENFORCE_GE(tensor->size(), 0, + paddle::platform::errors::PreconditionNotMet( + "You should call Tensor::Reshape(const " + "std::size_t &shape)function before copying" + "the string data from cpu.")); + *tensor = *data; +} + template void Tensor::CopyToCpuImpl(T *data, 
void *exec_stream, CallbackFunc cb, void *cb_params) const { - EAGER_GET_TENSOR; + EAGER_GET_TENSOR(paddle::framework::LoDTensor); auto ele_num = tensor->numel(); auto *t_data = tensor->data(); auto t_place = tensor->place(); @@ -371,6 +399,7 @@ Tensor::Tensor(void *scope) : scope_{scope} { "set to the pointer of scope.")); } +template void *Tensor::FindTensor() const { PADDLE_ENFORCE_EQ( name_.empty(), false, @@ -382,12 +411,12 @@ void *Tensor::FindTensor() const { PADDLE_ENFORCE_NOT_NULL( var, paddle::platform::errors::PreconditionNotMet( "No tensor called [%s] in the runtime scope", name_)); - auto *tensor = var->GetMutable(); + auto *tensor = var->GetMutable(); return tensor; } std::vector Tensor::shape() const { - EAGER_GET_TENSOR; + EAGER_GET_TENSOR(paddle::framework::LoDTensor); PADDLE_ENFORCE_NOT_NULL( tensor_, paddle::platform::errors::PreconditionNotMet( "Not found tensor called %s in the scope", name_)); @@ -395,7 +424,7 @@ std::vector Tensor::shape() const { } void Tensor::SetLoD(const std::vector> &x) { - EAGER_GET_TENSOR; + EAGER_GET_TENSOR(paddle::framework::LoDTensor); paddle::framework::LoD lod; for (auto &level : x) { lod.emplace_back(level); @@ -404,7 +433,7 @@ void Tensor::SetLoD(const std::vector> &x) { } std::vector> Tensor::lod() const { - EAGER_GET_TENSOR; + EAGER_GET_TENSOR(paddle::framework::LoDTensor); std::vector> res; for (auto &level : tensor->lod()) { res.emplace_back(level); diff --git a/paddle/fluid/inference/api/details/zero_copy_tensor_dummy.cc b/paddle/fluid/inference/api/details/zero_copy_tensor_dummy.cc index 1f1be13610379..eb134874c3aa8 100644 --- a/paddle/fluid/inference/api/details/zero_copy_tensor_dummy.cc +++ b/paddle/fluid/inference/api/details/zero_copy_tensor_dummy.cc @@ -36,7 +36,10 @@ template PD_INFER_DECL int64_t *Tensor::data(PlaceType *place, template float *Tensor::mutable_data(PlaceType place); template int64_t *Tensor::mutable_data(PlaceType place); -void *Tensor::FindTensor() const { return nullptr; } +template +void *Tensor::FindTensor() const { + return nullptr; +} std::vector Tensor::shape() const { return {}; } diff --git a/paddle/fluid/inference/api/details/zero_copy_tensor_test.cc b/paddle/fluid/inference/api/details/zero_copy_tensor_test.cc index 0c092a8684d1a..4b6f90f3f0652 100644 --- a/paddle/fluid/inference/api/details/zero_copy_tensor_test.cc +++ b/paddle/fluid/inference/api/details/zero_copy_tensor_test.cc @@ -88,7 +88,8 @@ bool SetPlaceAndCheck(PlaceType place, size_t length) { const std::vector> lod{{0, length}}; scope.Var(name); auto tensor = CreateTensor(place, &scope, name); - tensor->Reshape({static_cast(length)}); + std::vector shape{static_cast(length)}; + tensor->Reshape(shape); tensor->mutable_data(place); tensor->SetLoD(lod); diff --git a/paddle/fluid/inference/api/paddle_api.h b/paddle/fluid/inference/api/paddle_api.h index de6b28de27557..b137b7ba6f97e 100644 --- a/paddle/fluid/inference/api/paddle_api.h +++ b/paddle/fluid/inference/api/paddle_api.h @@ -174,6 +174,14 @@ class PD_INFER_DECL ZeroCopyTensor : public paddle_infer::Tensor { void copy_from_cpu(const T* data) { return CopyFromCpu(data); } + + /// \brief Experimental interface. + /// It's usually used to set the input tensor data with Strings data type. + /// \param data The pointer of the data, from which the tensor will copy. + void copy_strings_from_cpu(const paddle_infer::Strings* data) { + return CopyStringsFromCpu(data); + } + /// \brief Copy the tensor data to the host memory. /// It's usually used to get the output tensor data. 
/// \param[out] data The tensor will copy the data to the address. diff --git a/paddle/fluid/inference/api/paddle_tensor.h b/paddle/fluid/inference/api/paddle_tensor.h index f6dce74c30ded..24a72a0b9dadb 100644 --- a/paddle/fluid/inference/api/paddle_tensor.h +++ b/paddle/fluid/inference/api/paddle_tensor.h @@ -14,10 +14,16 @@ #pragma once +#include + #include "paddle_infer_declare.h" // NOLINT namespace paddle_infer { +/// \brief Experimental. +/// Strings for text data. +using Strings = std::vector; + typedef void (*CallbackFunc)(void*); #if defined(PADDLE_WITH_TESTING) && defined(PADDLE_WITH_INFERENCE_API_TEST) @@ -57,6 +63,14 @@ class PD_INFER_DECL Tensor { /// \param shape The shape to set. void Reshape(const std::vector& shape); + /// \brief Experimental interface. + /// Reset the shape of the Strings tensor. + /// Generally it's only used for the input tensor. + /// Reshape must be called before calling + /// ZeroCopyStringTensorCreate() or PaddleInferTensorCreate() + /// \param shape The shape to set. + void ReshapeStrings(const std::size_t& shape); + /// \brief Get the memory pointer in CPU or GPU with specific data type. /// Please Reshape the tensor first before call this. /// It's usually used to get input data pointer. @@ -78,6 +92,11 @@ class PD_INFER_DECL Tensor { template void CopyFromCpu(const T* data); + /// \brief Experimental interface. + /// It's usually used to set the input tensor data with Strings data type. + /// \param data The pointer of the data, from which the tensor will copy. + void CopyStringsFromCpu(const paddle_infer::Strings* data); + /// \brief Copy the tensor data to the host memory. /// It's usually used to get the output tensor data. /// \param[out] data The tensor will copy the data to the address. @@ -122,7 +141,10 @@ class PD_INFER_DECL Tensor { protected: explicit Tensor(void* scope); + + template void* FindTensor() const; + void SetPlace(PlaceType place, int device = -1); void SetName(const std::string& name); diff --git a/paddle/fluid/inference/io.cc b/paddle/fluid/inference/io.cc index d2bc95e7c3eb3..f976e217bab1a 100644 --- a/paddle/fluid/inference/io.cc +++ b/paddle/fluid/inference/io.cc @@ -17,11 +17,13 @@ limitations under the License. 
*/ #include #include #include + #include "paddle/fluid/framework/block_desc.h" #include "paddle/fluid/framework/feed_fetch_type.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/version.h" #include "paddle/fluid/platform/cpu_helper.h" +#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/pybind/pybind.h" DEFINE_string(devices, "", "The devices to be used which is joined by comma."); @@ -85,10 +87,12 @@ void LoadPersistables(framework::Executor* executor, framework::Scope* scope, framework::VarDesc* new_var = load_block->Var(var->Name()); new_var->SetShape(var->GetShape()); new_var->SetDataType(var->GetDataType()); - new_var->SetType(var->GetType()); + auto var_type = var->GetType(); + new_var->SetType(var_type); - if (var->GetType() != - framework::proto::VarType::Type::VarType_Type_SELECTED_ROWS) { + if ((var_type != + framework::proto::VarType::Type::VarType_Type_SELECTED_ROWS) && + (var_type != framework::proto::VarType::VOCAB)) { new_var->SetLoDLevel(var->GetLoDLevel()); } diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index 78cbc7e8a583b..937bfea3a59ef 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -17,6 +17,7 @@ add_subdirectory(metrics) add_subdirectory(optimizers) add_subdirectory(reduce_ops) add_subdirectory(sequence_ops) +add_subdirectory(string) add_subdirectory(jit) if(WITH_MKLDNN) add_subdirectory(mkldnn) @@ -78,10 +79,12 @@ if(WITH_UNITY_BUILD) include(unity_build_rule.cmake) endif() -register_operators(EXCLUDES py_layer_op py_func_op warpctc_op dgc_op sparse_attention_op lstm_op run_program_op eye_op recurrent_op - sync_batch_norm_op spectral_op ${OP_MKL_DEPS} DEPS ${OP_HEADER_DEPS}) +register_operators(EXCLUDES py_layer_op py_func_op warpctc_op dgc_op load_combine_op lstm_op run_program_op eye_op + recurrent_op save_combine_op sparse_attention_op sync_batch_norm_op spectral_op ${OP_MKL_DEPS} DEPS ${OP_HEADER_DEPS}) op_library(run_program_op SRCS run_program_op.cc run_program_op.cu.cc DEPS executor_cache ${OP_HEADER_DEPS}) +op_library(save_combine_op DEPS string_array) +op_library(load_combine_op DEPS string_array) if (WITH_GPU OR WITH_ROCM) if(WITH_ROCM) diff --git a/paddle/fluid/operators/controlflow/feed_op.cc b/paddle/fluid/operators/controlflow/feed_op.cc index 9597dd25ec530..bc29c92b09426 100644 --- a/paddle/fluid/operators/controlflow/feed_op.cc +++ b/paddle/fluid/operators/controlflow/feed_op.cc @@ -1,11 +1,8 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -29,6 +26,39 @@ class OpBase; namespace paddle { namespace operators { + +// FeedVariableVisitor is to feed the variable data +// according to data type (LoDTensor or Strings). 
+class FeedVariableVisitor : public boost::static_visitor { + public: + explicit FeedVariableVisitor(framework::Variable *out_var, + const platform::Place &place) + : out_var_(out_var), place_(place) {} + + void operator()(const framework::LoDTensor &in_tensor) const { + framework::LoDTensor *out_tensor = + out_var_->GetMutable(); + if (platform::is_same_place(in_tensor.place(), place_)) { + out_tensor->ShareDataWith(in_tensor); + } else { + platform::DeviceContext *context = + platform::DeviceContextPool::Instance().Get(place_); + framework::TensorCopy(in_tensor, place_, *context, out_tensor); + } + out_tensor->set_lod(in_tensor.lod()); + } + + void operator()(const framework::Strings &in_str) const { + framework::Strings *out_str = out_var_->GetMutable(); + out_str->resize(in_str.size()); + *out_str = in_str; + } + + private: + framework::Variable *out_var_; + const platform::Place &place_; +}; + class FeedOp : public framework::OperatorBase { public: FeedOp(const std::string &type, const framework::VariableNameMap &inputs, @@ -79,15 +109,9 @@ class FeedOp : public framework::OperatorBase { col, feed_list.size())); auto &feed_item = feed_list.at(static_cast(col)); - auto *out_item = out_var->GetMutable(); - if (platform::is_same_place(feed_item.place(), place)) { - out_item->ShareDataWith(feed_item); - } else { - auto *dev_ctx = platform::DeviceContextPool::Instance().Get(place); - framework::TensorCopy(feed_item, place, *dev_ctx, out_item); - } - out_item->set_lod(feed_item.lod()); + FeedVariableVisitor visitor(out_var, place); + boost::apply_visitor(visitor, feed_item); } }; @@ -95,17 +119,17 @@ class FeedOpInfoMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { AddInput("X", - "(vector) A feeding list of LoDTensor, which may have " + "(vector) " + "A feeding list of LoDTensor, which may have " "different dimension and data type."); AddOutput("Out", - "(LoDTensor) The LoDTensor which is a copy of the col-th feeding " + "(LoDTensor) The LoDTensor which is a copy " + "of the col-th feeding " "object."); AddAttr("col", "(int) The column index of current feeding object."); AddComment(R"DOC( Feed Operator. - It should not be configured by users directly. - )DOC"); } }; diff --git a/paddle/fluid/operators/controlflow/fetch_op.cc b/paddle/fluid/operators/controlflow/fetch_op.cc index d86b6b48422d9..99b16d9b69253 100644 --- a/paddle/fluid/operators/controlflow/fetch_op.cc +++ b/paddle/fluid/operators/controlflow/fetch_op.cc @@ -109,6 +109,10 @@ class FetchOp : public framework::OperatorBase { auto &src_item = fetch_var->Get(); auto *dst_item = &(BOOST_GET(framework::LoDTensor, fetch_list->at(col))); DataCopy(src_item, fetch_var_name, dst_item); + } else if (fetch_var->IsType()) { + auto &src_item = fetch_var->Get(); + auto *dst_item = &(BOOST_GET(framework::Vocab, fetch_list->at(col))); + *dst_item = src_item; } else { auto &src_item = fetch_var->Get(); framework::LoDTensorArray tmp(src_item.size()); @@ -128,9 +132,11 @@ class FetchOpInfoMaker : public framework::OpProtoAndCheckerMaker { AddInput("X", "(LoDTensor) The resulted LoDTensor which is expected to return " "to users."); - AddOutput("Out", - "(vector) A fetching list of LoDTensor which may have " - "different dimension, shape and data type."); + AddOutput( + "Out", + "(vector|unordered_map) A fetching list" + " of LoDTensor|unordered_map which may have " + "different dimension, shape and data type."); AddAttr("col", "(int) The column index of fetching object."); AddComment(R"DOC( Fetch Operator. 
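With FeedType widened to a boost::variant over LoDTensor and Strings, the feed path now dispatches on the runtime alternative: the new SetFeedVariable overload stores a Strings value into the FeedList, and feed_op's FeedVariableVisitor copies whichever alternative it finds into the output variable. A rough usage sketch for the string alternative, assuming the SetFeedVariable(Scope*, const Strings&, ...) overload declared earlier in this patch; the scope setup, the variable name "feed", and the slot index are illustrative only:

#include <string>

#include "paddle/fluid/framework/feed_fetch_method.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/string_array.h"

void FeedRawTexts(paddle::framework::Scope* scope) {
  // Strings is just std::vector<std::string>; two raw text samples.
  paddle::framework::Strings texts = {"hello world", "paddle tokenizer"};
  // Stores the strings into slot 0 of the "feed" variable; when feed_op runs,
  // FeedVariableVisitor's Strings overload copies them into its output.
  paddle::framework::SetFeedVariable(scope, texts, "feed", 0);
}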
diff --git a/paddle/fluid/operators/load_combine_op.h b/paddle/fluid/operators/load_combine_op.h index 589df8821b3e7..a02b0e61d9278 100644 --- a/paddle/fluid/operators/load_combine_op.h +++ b/paddle/fluid/operators/load_combine_op.h @@ -21,6 +21,8 @@ limitations under the License. */ #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/data_type_transform.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/string_array.h" +#include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/platform/device_context.h" namespace paddle { @@ -75,38 +77,57 @@ class LoadCombineOpKernel : public framework::OpKernel { out_vars[i], platform::errors::InvalidArgument( "The variable %s to be loaded cannot be found.", out_var_names[i])); - - auto *tensor = out_vars[i]->GetMutable(); - // Error checking PADDLE_ENFORCE_EQ( static_cast(*buffer), true, platform::errors::Unavailable( "An error occurred while loading model parameters. " "Please check whether the model file is complete or damaged.")); - - // Get data from fin to tensor - DeserializeFromStream(*buffer, tensor, dev_ctx); - - auto in_dtype = tensor->type(); - auto out_dtype = - load_as_fp16 ? framework::proto::VarType::FP16 : in_dtype; - - if (in_dtype != out_dtype) { - // convert to float16 tensor - auto in_kernel_type = framework::OpKernelType(in_dtype, place); - auto out_kernel_type = framework::OpKernelType(out_dtype, place); - framework::LoDTensor fp16_tensor; - // copy LoD info to the new tensor - fp16_tensor.set_lod(tensor->lod()); - framework::TransDataType(in_kernel_type, out_kernel_type, *tensor, - &fp16_tensor); - - // reset output tensor - out_vars[i]->Clear(); - tensor = out_vars[i]->GetMutable(); - tensor->set_lod(fp16_tensor.lod()); - tensor->ShareDataWith(fp16_tensor); + if (out_vars[i]->IsType()) { + auto *tensor = out_vars[i]->GetMutable(); + tensor->clear(); + std::unordered_map data; + framework::StringMapFromStream(*buffer, &data); + for (auto it = data.begin(); it != data.end(); ++it) { + std::string tmp; + framework::NFD(it->first, &tmp); + if (tmp.empty()) { + VLOG(0) << "The string " << it->first + << " was converted to unicode failedly! " + << "Then dropped to load it."; + continue; + } + std::wstring token; + bool status = framework::ConvertStrToWstr(tmp, &token); + if (!status) continue; + tensor->emplace(token, it->second); + } + } else { + auto *tensor = out_vars[i]->GetMutable(); + + // Get data from fin to tensor + DeserializeFromStream(*buffer, tensor, dev_ctx); + + auto in_dtype = tensor->type(); + auto out_dtype = + load_as_fp16 ? framework::proto::VarType::FP16 : in_dtype; + + if (in_dtype != out_dtype) { + // convert to float16 tensor + auto in_kernel_type = framework::OpKernelType(in_dtype, place); + auto out_kernel_type = framework::OpKernelType(out_dtype, place); + framework::LoDTensor fp16_tensor; + // copy LoD info to the new tensor + fp16_tensor.set_lod(tensor->lod()); + framework::TransDataType(in_kernel_type, out_kernel_type, *tensor, + &fp16_tensor); + + // reset output tensor + out_vars[i]->Clear(); + tensor = out_vars[i]->GetMutable(); + tensor->set_lod(fp16_tensor.lod()); + tensor->ShareDataWith(fp16_tensor); + } } } buffer->peek(); diff --git a/paddle/fluid/operators/save_combine_op.h b/paddle/fluid/operators/save_combine_op.h index 939768693a243..6e6c826a22892 100644 --- a/paddle/fluid/operators/save_combine_op.h +++ b/paddle/fluid/operators/save_combine_op.h @@ -19,11 +19,13 @@ limitations under the License. 
*/ #include #include #include +#include #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/data_type_transform.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/string_array.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/port.h" @@ -66,34 +68,48 @@ class SaveCombineOpKernel : public framework::OpKernel { inp_vars[i], platform::errors::InvalidArgument("Cannot find variable %s to save.", inp_var_names[i])); - PADDLE_ENFORCE_EQ(inp_vars[i]->IsType(), true, + PADDLE_ENFORCE_EQ(inp_vars[i]->IsType() || + inp_vars[i]->IsType(), + true, platform::errors::InvalidArgument( "SaveCombine operator only supports saving " - "LoDTensor variable, %s has wrong type.", + "LoDTensor or Vocab variable, %s has wrong type.", inp_var_names[i])); - auto &tensor = inp_vars[i]->Get(); - PADDLE_ENFORCE_EQ( - tensor.IsInitialized(), true, - platform::errors::InvalidArgument( - "The Tensor of Variable(%s) to be saved is not initialized.", - inp_var_names[i])); - // Serialize tensors one by one - // Check types to see if a fp16 transformation is required - auto in_dtype = tensor.type(); - auto out_dtype = - save_as_fp16 ? framework::proto::VarType::FP16 : in_dtype; + if (inp_vars[i]->IsType()) { + auto &tensor = inp_vars[i]->Get(); + PADDLE_ENFORCE_EQ( + tensor.IsInitialized(), true, + platform::errors::InvalidArgument( + "The Tensor of Variable(%s) to be saved is not initialized.", + inp_var_names[i])); + // Serialize tensors one by one + // Check types to see if a fp16 transformation is required + auto in_dtype = tensor.type(); + auto out_dtype = + save_as_fp16 ? framework::proto::VarType::FP16 : in_dtype; - if (in_dtype != out_dtype) { - auto in_kernel_type = framework::OpKernelType(in_dtype, place); - auto out_kernel_type = framework::OpKernelType(out_dtype, place); - framework::LoDTensor out; - // copy LoD info to the new tensor - out.set_lod(tensor.lod()); - framework::TransDataType(in_kernel_type, out_kernel_type, tensor, &out); - framework::SerializeToStream(ss, out, dev_ctx); + if (in_dtype != out_dtype) { + auto in_kernel_type = framework::OpKernelType(in_dtype, place); + auto out_kernel_type = framework::OpKernelType(out_dtype, place); + framework::LoDTensor out; + // copy LoD info to the new tensor + out.set_lod(tensor.lod()); + framework::TransDataType(in_kernel_type, out_kernel_type, tensor, + &out); + framework::SerializeToStream(ss, out, dev_ctx); + } else { + framework::SerializeToStream(ss, tensor, dev_ctx); + } } else { - framework::SerializeToStream(ss, tensor, dev_ctx); + auto &tensor = inp_vars[i]->Get(); + std::unordered_map data; + for (auto it = tensor.begin(); it != tensor.end(); ++it) { + std::string t; + framework::ConvertWstrToStr(it->first, &t); + data.emplace(t, it->second); + } + framework::StringMapToStream(ss, data); } } if (save_to_memory) { diff --git a/paddle/fluid/operators/string/CMakeLists.txt b/paddle/fluid/operators/string/CMakeLists.txt new file mode 100644 index 0000000000000..1da2e8e455da0 --- /dev/null +++ b/paddle/fluid/operators/string/CMakeLists.txt @@ -0,0 +1,6 @@ +include(operators) +if(WITH_UNITY_BUILD) + # Load Unity Build rules for operators in paddle/fluid/operators/sequence_ops. 
+ include(unity_build_rule.cmake) +endif() +register_operators(DEPS op_version_registry utf8proc string_array) diff --git a/paddle/fluid/operators/string/faster_tokenizer_op.cc b/paddle/fluid/operators/string/faster_tokenizer_op.cc new file mode 100644 index 0000000000000..49457af8f00c8 --- /dev/null +++ b/paddle/fluid/operators/string/faster_tokenizer_op.cc @@ -0,0 +1,524 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "paddle/fluid/framework/string_array.h" +#include "paddle/fluid/operators/string/faster_tokenizer_op.h" + +namespace paddle { +namespace operators { + +using std::bad_cast; +using std::codecvt_utf8; +using std::endl; +using std::exception; +using std::ifstream; +using std::int64_t; +using std::min; +using std::runtime_error; +using std::unordered_map; +using std::unordered_set; +using std::shared_ptr; +using std::size_t; +using std::int64_t; +using std::string; +using std::vector; +using std::wstring; + +const wstring kStripChars = L" \t\n\r\v\f"; + +inline bool IsControl(const wchar_t& ch) { + if (ch == L'\t' || ch == L'\n' || ch == L'\r') return false; + auto cat = utf8proc_category(ch); + if (cat == UTF8PROC_CATEGORY_CC || cat == UTF8PROC_CATEGORY_CF) return true; + return false; +} + +inline bool IsChineseChar(const wchar_t& ch) { + if ((ch >= 0x4E00 && ch <= 0x9FFF) || (ch >= 0x3400 && ch <= 0x4DBF) || + (ch >= 0x20000 && ch <= 0x2A6DF) || (ch >= 0x2A700 && ch <= 0x2B73F) || + (ch >= 0x2B740 && ch <= 0x2B81F) || (ch >= 0x2B820 && ch <= 0x2CEAF) || + (ch >= 0xF900 && ch <= 0xFAFF) || (ch >= 0x2F800 && ch <= 0x2FA1F)) + return true; + return false; +} + +inline bool IsWhiteSpace(const wchar_t& ch) { + if (ch == L' ' || ch == L'\t' || ch == L'\n' || ch == L'\r') return true; + auto cat = utf8proc_category(ch); + if (cat == UTF8PROC_CATEGORY_ZS) return true; + return false; +} + +inline bool IsPunctuation(const wchar_t& ch) { + if ((ch >= 33 && ch <= 47) || (ch >= 58 && ch <= 64) || + (ch >= 91 && ch <= 96) || (ch >= 123 && ch <= 126)) + return true; + auto cat = utf8proc_category(ch); + if (cat == UTF8PROC_CATEGORY_PD || cat == UTF8PROC_CATEGORY_PS || + cat == UTF8PROC_CATEGORY_PE || cat == UTF8PROC_CATEGORY_PC || + cat == UTF8PROC_CATEGORY_PO // sometimes Ā¶ belong SO + || cat == UTF8PROC_CATEGORY_PI || cat == UTF8PROC_CATEGORY_PF) + return true; + return false; +} + +BasicTokenizer::BasicTokenizer(bool do_lower_case /* = true */) + : do_lower_case_(do_lower_case) {} + +wchar_t BasicTokenizer::do_lower_case(wchar_t ch) const { + wchar_t new_ch = utf8proc_tolower(ch); + return new_ch; +} + +void BasicTokenizer::Tokenize(const string& text, vector* res) const { + std::wstring unicode_text; + bool status = framework::ConvertStrToWstr(text, &unicode_text); + if (!status) { + // String is converted into wstring failedly. 
+ return; + } + + std::wstring dest_text; + for (auto ch : unicode_text) { + if (ch == 0 || ch == 0xfffd || IsControl(ch)) { + continue; + } + if (do_lower_case_) { + ch = do_lower_case(ch); + } + if (IsChineseChar(ch) || IsPunctuation(ch)) { + dest_text += ' '; + dest_text += ch; + dest_text += ' '; + } else if (IsWhiteSpace(ch)) { + dest_text += ' '; + } else { + dest_text += ch; + } + } + boost::split(*res, dest_text, boost::is_any_of(kStripChars)); +} + +WordPieceTokenizer::WordPieceTokenizer( + framework::Vocab* vocab, const wstring& unk_token /* = L"[UNK]"*/, + const size_t max_input_chars_per_word /* = 100 */) + : vocab_(vocab), + unk_token_(unk_token), + max_input_chars_per_word_(max_input_chars_per_word) { + unk_token_id_ = (*vocab_)[unk_token_]; +} + +void WordPieceTokenizer::Tokenize(const wstring& text, + vector* token_ids) const { + size_t len = text.size(); + if (len > max_input_chars_per_word_) { + token_ids->emplace_back(std::move(unk_token_id_)); + return; + } + + auto it = vocab_->find(text); + if (it != vocab_->end()) { + token_ids->emplace_back(std::move(it->second)); + return; + } + + size_t start = 0; + vector wordpiece_ids; + while (start < len) { + size_t end = len; + std::wstring cur_substr; + int64_t cur_substr_id; + while (start < end) { + std::wstring sub = text.substr(start, end - start); + if (start > 0) { + sub = L"##" + sub; + } + auto it = vocab_->find(sub); + if (it != vocab_->end()) { + cur_substr = sub; + cur_substr_id = it->second; + break; + } + end -= 1; + } + + if (cur_substr.empty()) { + token_ids->emplace_back(std::move(unk_token_id_)); + return; + } else { + start = end; + wordpiece_ids.emplace_back(std::move(cur_substr_id)); + } + } + for (auto& token_id : wordpiece_ids) { + token_ids->emplace_back(std::move(token_id)); + } +} + +BertTokenizer::BertTokenizer(framework::Vocab* vocab, + bool do_lower_case /* = false */, + const wstring& unk_token /* = L"[UNK]" */, + const wstring& pad_token /* = L"[PAD]" */, + const wstring& cls_token /* = L"[CLS]" */, + const wstring& mask_token /* = L"[MASK]" */, + const wstring& sep_token /* = L"[SEP]" */, + const string& padding_site /* = "right" */) + : do_lower_case_(do_lower_case), + unk_token_(unk_token), + pad_token_(pad_token), + cls_token_(cls_token), + mask_token_(mask_token), + sep_token_(sep_token), + padding_site_(padding_site), + vocab_(vocab), + basic_tokenizer_(do_lower_case_), + word_piece_tokenizer_(vocab_, unk_token) { + unk_token_id_ = (*vocab_)[unk_token_]; + pad_token_id_ = (*vocab_)[pad_token_]; + cls_token_id_ = (*vocab_)[cls_token_]; + mask_token_id_ = (*vocab_)[mask_token_]; + sep_token_id_ = (*vocab_)[sep_token_]; + + all_special_tokens_ = vector( + {unk_token_, pad_token_, cls_token_, mask_token_, sep_token_}); + all_special_token_ids_ = + unordered_set({unk_token_id_, pad_token_id_, cls_token_id_, + mask_token_id_, sep_token_id_}); +} + +void BertTokenizer::Tokenize(const string& text, + vector* split_token_ids) const { + std::vector tmp_tokens; + basic_tokenizer_.Tokenize(text, &tmp_tokens); + if (tmp_tokens.empty()) return; + split_token_ids->reserve(tmp_tokens.size()); + for (auto& w_token : tmp_tokens) { + const auto& vec_size = w_token.size(); + if (vec_size == 1) { + if (IsChineseChar(w_token[0])) { + auto vocab_it = vocab_->find(w_token); + if (vocab_it != vocab_->end()) { + split_token_ids->emplace_back(std::move(vocab_it->second)); + } else { + split_token_ids->emplace_back(std::move(unk_token_id_)); + } + } else { + word_piece_tokenizer_.Tokenize(w_token, split_token_ids); + 
} + } else if (vec_size > 1) { + word_piece_tokenizer_.Tokenize(w_token, split_token_ids); + } else { + continue; + } + } +} + +void BertTokenizer::BuildInputsWithSpecialTokens( + vector* inputs, const vector& token_ids_0, + const vector& token_ids_1 /* = vector() */) const { + if (token_ids_1.size() == 0) { + inputs->clear(); + inputs->resize(token_ids_0.size() + 2); + inputs->at(0) = std::move(cls_token_id_); + size_t i = 1; + for (auto& token_id : token_ids_0) { + inputs->at(i) = std::move(token_id); + ++i; + } + inputs->at(i) = std::move(sep_token_id_); + } else { + inputs->clear(); + inputs->resize(token_ids_0.size() + token_ids_1.size() + 3); + inputs->at(0) = std::move(cls_token_id_); + size_t i = 1; + for (auto& token_id : token_ids_0) { + inputs->at(i) = std::move(token_id); + ++i; + } + inputs->at(i) = std::move(sep_token_id_); + ++i; + for (auto& token_id : token_ids_1) { + inputs->at(i) = std::move(token_id); + ++i; + } + inputs->at(i) = std::move(sep_token_id_); + } +} + +int64_t BertTokenizer::GetNumSpecialTokensToAdd(const bool pair) const { + if (pair) { + return 3; + } else { + return 2; + } +} + +void BertTokenizer::CreateTokenTypeIdsFromSequences( + vector* token_type_ids, const vector& token_ids_0, + const vector& token_ids_1 /* = vector() */) const { + if (token_ids_1.size() == 0) { + vector tmp(token_ids_0.size() + 2, 0); + token_type_ids->swap(tmp); + } else { + vector tmp(token_ids_0.size() + token_ids_1.size() + 3, 0); + for (size_t i = token_ids_0.size() + 2; i < tmp.size(); i++) { + tmp[i] = 1; + } + token_type_ids->swap(tmp); + } +} + +void BertTokenizer::TruncateSequence( + vector* ids, vector* pair_ids, + const size_t num_tokens_to_remove /* = 0 */, + const size_t stride /* = 0 */) const { + for (size_t i = 0; i < num_tokens_to_remove; i++) { + if ((pair_ids->size() == 0) || (ids->size() > pair_ids->size())) { + ids->pop_back(); + } else { + pair_ids->pop_back(); + } + } +} + +int64_t BertTokenizer::GetPadTokenID() const { return pad_token_id_; } + +int BertTokenizer::Encode( + unordered_map>* encoded_inputs, const string& text, + const string& text_pair /* = "" */, bool is_split_into_words /* = false */, + const size_t max_seq_len /* = 0 */, + bool pad_to_max_seq_len /* = false */) const { + vector ids; + vector pair_ids; + if (!is_split_into_words) { + Tokenize(text, &ids); + if (ids.empty()) return 0; + if (text_pair != "") { + Tokenize(text_pair, &pair_ids); + if (pair_ids.empty()) return 0; + } + } else { + std::wstring unicode_text; + bool status_a = framework::ConvertStrToWstr(text, &unicode_text); + if (!status_a) { + return 0; + } + for (size_t i = 0; i < unicode_text.size(); i++) { + wstring token = unicode_text.substr(i, 1); + auto it = vocab_->find(token); + if (it != vocab_->end()) { + ids.emplace_back(std::move(it->second)); + } else { + ids.emplace_back(std::move(unk_token_id_)); + } + } + } + + bool pair = false; + if (pair_ids.size() != 0) { + pair = true; + } + + size_t len_ids = ids.size(); + size_t len_pair_ids = pair_ids.size(); + + // Truncation: Handle max sequence length + // If max_seq_len == 0, then do nothing and keep the real length. + // If max_seq_len > 0 and + // all the input sequence len is over the max_seq_len, + // then we truncate it. 
+ size_t total_len = len_ids + len_pair_ids + GetNumSpecialTokensToAdd(pair); + if (max_seq_len > 0 && total_len > max_seq_len) { + TruncateSequence(&ids, &pair_ids, total_len - max_seq_len); + } + + // Add special tokens + vector sequence; + BuildInputsWithSpecialTokens(&sequence, ids, pair_ids); + size_t seq_len = sequence.size(); + vector token_type_ids; + CreateTokenTypeIdsFromSequences(&token_type_ids, ids, pair_ids); + + // Build output dictionnary + encoded_inputs->emplace("input_ids", sequence); + encoded_inputs->emplace("token_type_ids", token_type_ids); + // Check lengths + if (max_seq_len > 0 && seq_len > max_seq_len) { + VLOG(3) << "There is something wrong with the input sequence length." + " Please check it."; + // Failed. + return 0; + } + + // Padding + bool needs_to_be_padded = false; + if (pad_to_max_seq_len && max_seq_len > 0 && (seq_len < max_seq_len)) { + needs_to_be_padded = true; + } + + if (needs_to_be_padded) { + int64_t difference = max_seq_len - seq_len; + size_t pad_start = max_seq_len - 1 - difference; + encoded_inputs->at("token_type_ids").resize(max_seq_len); + for (size_t i = max_seq_len - 1; i > pad_start; i--) { + encoded_inputs->at("token_type_ids")[i] = pad_token_id_; + } + + encoded_inputs->at("input_ids").resize(max_seq_len); + for (size_t i = max_seq_len - 1; i > pad_start; i--) { + encoded_inputs->at("input_ids")[i] = pad_token_id_; + } + } + return 1; +} + +void BertTokenizer::BatchEncode( + vector>>* batch_encode_inputs, + const vector& batch_text, + const vector& batch_text_pair /* = vector() */, + bool is_split_into_words /* = false */, const size_t max_seq_len /* = 0 */, + bool pad_to_max_seq_len /* = false */) const { + bool has_text_pair = false; + if (batch_text_pair.size() != 0) { + has_text_pair = true; + } + + size_t batch_size = batch_text.size(); +#ifdef PADDLE_WITH_MKLML +#pragma omp parallel for +#endif + for (size_t i = 0; i < batch_size; i++) { + unordered_map> res; + if (has_text_pair) { + auto status = + Encode(&res, batch_text[i], batch_text_pair[i], is_split_into_words, + max_seq_len, pad_to_max_seq_len); + if (!status) { + res["input_ids"] = + std::vector{cls_token_id_, sep_token_id_, cls_token_id_}; + res["token_type_ids"] = std::vector{0, 0, 1}; + } + } else { + auto status = Encode(&res, batch_text[i], {}, is_split_into_words, + max_seq_len, pad_to_max_seq_len); + + if (!status) { + res["input_ids"] = std::vector{cls_token_id_, sep_token_id_}; + res["token_type_ids"] = std::vector{0, 0}; + } + } + batch_encode_inputs->at(i) = std::move(res); + } +} + +class FasterTokenizerOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("Text"), "Input", "Text", "Tokenizer"); + OP_INOUT_CHECK(ctx->HasInput("Vocab"), "Input", "Vocab", "Tokenizer"); + OP_INOUT_CHECK(ctx->HasOutput("InputIds"), "Output", "InputIds", + "Tokenizer"); + OP_INOUT_CHECK(ctx->HasOutput("SegmentIds"), "Output", "SegmentIds", + "Tokenizer"); + + ctx->SetOutputDim("InputIds", {-1, -1}); + ctx->SetOutputDim("SegmentIds", {-1, -1}); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType(framework::proto::VarType::INT64, + paddle::platform::CPUPlace()); + } + + framework::OpKernelType GetKernelTypeForVar( + const std::string& var_name, const framework::Tensor& tensor, + const framework::OpKernelType& 
expected_kernel_type) const override { + return framework::OpKernelType(expected_kernel_type.data_type_, + expected_kernel_type.place_, + tensor.layout()); + } +}; + +class FasterTokenizerOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("Vocab", + "(std::map), The vocab to map " + "token string to token id."); + AddInput("Text", + "(std::vector), The sequence to be processed. " + "One sequence is a string, a list of strings, " + "or a list of integers depending on whether it " + "has been pretokenized and converted to ids. "); + AddInput("TextPair", + "(std::vector), Same as `text` argument, " + "while it represents for the latter sequence of the " + "sequence pair.") + .AsDispensable(); + AddOutput("InputIds", "(Tensor), The token ids of the input text."); + AddOutput("SegmentIds", "(Tensor), The segments ids of the input text."); + AddAttr( + "do_lower_case", + "(bool), Whether or not to lowercase the input when tokenizing.") + .SetDefault(false); + AddAttr( + "is_split_into_words", + "(bool), Whether or not the input is already pre-tokenized " + "(e.g., split into words). If set to True, the tokenizer " + "assumes the input is already split into words (for instance, " + "by splitting it on whitespace) which it will tokenize. This " + "is useful for NER or token classification.") + .SetDefault(false); + AddAttr("max_seq_len", + "(int), If set to a positive number, will limit the " + "total sequence returned so that it has a maximum length." + " If there are overflowing tokens, those overflowing " + "tokens will be added to the returned dictionary when " + "`return_overflowing_tokens` is `True`.") + .SetDefault(0); + AddAttr("pad_to_max_seq_len", + "(bool), If set to `True`, the returned sequences would be" + " padded up to `max_seq_len` specified length according to" + " padding side and padding token id.") + .SetDefault(false); + AddComment(R"DOC(Performs tokenization and uses the tokenized tokens to " + "prepare model inputs. It supports sequence or sequence pair as input, " + "and batch input is not allowed.)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(faster_tokenizer, ops::FasterTokenizerOp, + ops::FasterTokenizerOpMaker); + +REGISTER_OP_CPU_KERNEL(faster_tokenizer, ops::FasterTokenizerKernel); diff --git a/paddle/fluid/operators/string/faster_tokenizer_op.h b/paddle/fluid/operators/string/faster_tokenizer_op.h new file mode 100755 index 0000000000000..d9b7fa26a6704 --- /dev/null +++ b/paddle/fluid/operators/string/faster_tokenizer_op.h @@ -0,0 +1,196 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include + +#include +#include +#include +#include + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/string_array.h" + +namespace paddle { +namespace operators { + +using std::endl; +using std::int64_t; +using std::size_t; +using std::string; +using std::shared_ptr; +using std::vector; +using std::unordered_map; +using std::unordered_set; +using std::vector; +using std::wstring; +using std::wcout; + +inline bool IsControl(const wchar_t& ch); +inline bool IsChineseChar(const wchar_t& ch); +inline bool IsWhiteSpace(const wchar_t& ch); + +using Vocab = unordered_map; +using InvVocab = unordered_map; + +class BasicTokenizer { + public: + explicit BasicTokenizer(bool do_lower_case = true); + void Tokenize(const string& text, vector* res) const; + + private: + wchar_t do_lower_case(wchar_t ch) const; + + bool do_lower_case_; +}; + +class WordPieceTokenizer { + public: + explicit WordPieceTokenizer(framework::Vocab* vocab, + const wstring& unk_token = L"[UNK]", + const size_t max_input_chars_per_word = 100); + void Tokenize(const wstring& text, vector* output) const; + + private: + framework::Vocab* vocab_; + wstring unk_token_{L"[UNK]"}; + int64_t unk_token_id_; + size_t max_input_chars_per_word_; +}; + +class BertTokenizer { + public: + explicit BertTokenizer(framework::Vocab* vocab, bool do_lower_case = false, + const wstring& unk_token = L"[UNK]", + const wstring& pad_token = L"[PAD]", + const wstring& cls_token = L"[CLS]", + const wstring& mask_token = L"[MASK]", + const wstring& sep_token = L"[SEP]", + const string& padding_site = "right"); + + void Tokenize(const string& text, vector* split_tokens) const; + void BuildInputsWithSpecialTokens( + vector* res, const vector& token_ids_0, + const vector& token_ids_1 = vector()) const; + void CreateTokenTypeIdsFromSequences( + vector* token_type_ids, const vector& token_ids_0, + const vector& token_ids_1 = vector()) const; + void TruncateSequence(vector* ids, vector* pair_ids, + const size_t num_tokens_to_remove = 0, + const size_t stride = 0) const; + int64_t GetNumSpecialTokensToAdd(const bool pair = false) const; + int Encode(unordered_map>* encoded_inputs, + const string& text, const string& text_pair = "", + bool is_split_into_words = false, const size_t max_seq_len = 0, + bool pad_to_max_seq_len = false) const; + void BatchEncode( + vector>>* batch_encode_inputs, + const vector& batch_text, + const vector& batch_text_pair = vector(), + bool is_split_into_words = false, const size_t max_seq_len = 0, + bool pad_to_max_seq_len = false) const; + + int64_t GetPadTokenID() const; + + private: + bool do_lower_case_; + wstring unk_token_, pad_token_, cls_token_, mask_token_, sep_token_; + string padding_site_; + framework::Vocab* vocab_; + BasicTokenizer basic_tokenizer_; + WordPieceTokenizer word_piece_tokenizer_; + int64_t unk_token_id_, cls_token_id_, mask_token_id_, pad_token_id_, + sep_token_id_; + vector all_special_tokens_; + unordered_set all_special_token_ids_; + InvVocab inv_vocab_; +}; + +template +class FasterTokenizerKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* text = ctx.Input("Text"); + auto* vocab = ctx.Input("Vocab"); + + auto* input_ids = ctx.Output("InputIds"); + auto* seg_ids = ctx.Output("SegmentIds"); + + auto do_lower_case = static_cast(ctx.Attr("do_lower_case")); + auto is_split_into_words = + static_cast(ctx.Attr("is_split_into_words")); + auto max_seq_len = 
static_cast(ctx.Attr("max_seq_len")); + auto pad_to_max_seq_len = + static_cast(ctx.Attr("pad_to_max_seq_len")); + + auto* text_pair = ctx.Input("TextPair"); + if (text_pair && text->size() != text_pair->size()) { + VLOG(3) << "The input text(list[str]) and text pair (list[str]) must" + << "be the same number of text sequence. Please check the input!"; + return; + } + + BertTokenizer* tokenizer_ptr = + new BertTokenizer(const_cast(vocab), do_lower_case); + size_t batch_max_seq_len = 0; + size_t batch_size = text->size(); + + vector>> batch_encode_inputs( + batch_size); + if (text_pair) { + tokenizer_ptr->BatchEncode(&batch_encode_inputs, *text, *text_pair, + is_split_into_words, max_seq_len, + pad_to_max_seq_len); + } else { + tokenizer_ptr->BatchEncode(&batch_encode_inputs, *text, vector(), + is_split_into_words, max_seq_len, + pad_to_max_seq_len); + } + + for (size_t i = 0; i < batch_size; ++i) { + size_t seq_len = batch_encode_inputs[i]["input_ids"].size(); + if (seq_len > batch_max_seq_len) { + batch_max_seq_len = seq_len; + } + } + + input_ids->Resize( + framework::make_ddim({static_cast(batch_size), + static_cast(batch_max_seq_len)})); + auto* input_ids_data = input_ids->mutable_data(ctx.GetPlace()); + seg_ids->Resize( + framework::make_ddim({static_cast(batch_size), + static_cast(batch_max_seq_len)})); + auto* seg_ids_data = seg_ids->mutable_data(ctx.GetPlace()); + + auto pad_token_id = tokenizer_ptr->GetPadTokenID(); + for (size_t i = 0; i < batch_size; i++) { + auto& encoder_input_ids = batch_encode_inputs[i]["input_ids"]; + auto& encoder_seg_ids = batch_encode_inputs[i]["token_type_ids"]; + const size_t& seq_len = encoder_input_ids.size(); + // Copy the memory + std::memcpy(input_ids_data + i * batch_max_seq_len, + encoder_input_ids.data(), seq_len * sizeof(T)); + std::memcpy(seg_ids_data + i * batch_max_seq_len, encoder_seg_ids.data(), + seq_len * sizeof(T)); + std::memset(input_ids_data + i * batch_max_seq_len + seq_len, + pad_token_id, (batch_max_seq_len - seq_len) * sizeof(T)); + std::memset(seg_ids_data + i * batch_max_seq_len + seq_len, pad_token_id, + (batch_max_seq_len - seq_len) * sizeof(T)); + } + delete tokenizer_ptr; + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/string/unity_build_rule.cmake b/paddle/fluid/operators/string/unity_build_rule.cmake new file mode 100644 index 0000000000000..a4b209d2df13e --- /dev/null +++ b/paddle/fluid/operators/string/unity_build_rule.cmake @@ -0,0 +1,8 @@ +# This file records the Unity Build compilation rules. +# The source files in a `register_unity_group` called are compiled in a unity +# file. +# Generally, the combination rules in this file do not need to be modified. +# If there are some redefined error in compiling with the source file which +# in combination rule, you can remove the source file from the following rules. 
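+# For example, removing faster_tokenizer_op.cc from the group below would
+# compile it as an ordinary standalone source instead of part of a unity file.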
+register_unity_group(cc + faster_tokenizer_op.cc) \ No newline at end of file diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc index f94afaa56b8df..8b01f02ee2c3a 100644 --- a/paddle/fluid/pybind/imperative.cc +++ b/paddle/fluid/pybind/imperative.cc @@ -1875,6 +1875,12 @@ void BindImperative(py::module *m_ptr) { } else if (self.Var().IsType()) { return framework::vectorize( self.Var().Get().value().dims()); + } else if (self.Var().IsType()) { + return std::vector{static_cast( + self.Var().Get().size())}; + } else if (self.Var().IsType()) { + return std::vector{ + static_cast(self.Var().Get().size())}; } else { VLOG(2) << "It is meaningless to get shape of " "variable type " diff --git a/paddle/fluid/pybind/inference_api.cc b/paddle/fluid/pybind/inference_api.cc index e02f25ff636a2..5193724ecedf5 100644 --- a/paddle/fluid/pybind/inference_api.cc +++ b/paddle/fluid/pybind/inference_api.cc @@ -185,6 +185,18 @@ void ZeroCopyTensorCreate( tensor.copy_from_cpu(static_cast(data.data())); } +/// \brief Experimental interface. +/// Create the Strings tensor from data. +/// \param tensor The tensor will be created and +/// the tensor value is same as data. +/// \param data The input text. +void ZeroCopyStringTensorCreate(ZeroCopyTensor &tensor, // NOLINT + const paddle_infer::Strings *data) { + size_t shape = data->size(); + tensor.ReshapeStrings(shape); + tensor.copy_strings_from_cpu(data); +} + template void PaddleInferTensorCreate( paddle_infer::Tensor &tensor, // NOLINT @@ -195,6 +207,19 @@ void PaddleInferTensorCreate( tensor.CopyFromCpu(static_cast(data.data())); } +/// \brief Experimental interface. +/// Create the Strings tensor from data. +/// \param tensor The tensor will be created and +/// the tensor value is same as data. +/// \param data The input text. 
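+/// Called from the Python-side `copy_from_cpu` wrapper when a list of str is
+/// passed: the tensor is reshaped to data->size() strings and then filled via
+/// CopyStringsFromCpu.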
+void PaddleInferStringTensorCreate(paddle_infer::Tensor &tensor, // NOLINT + const paddle_infer::Strings *data) { + VLOG(3) << "Create PaddleInferTensor, dtype = Strings "; + size_t shape = data->size(); + tensor.ReshapeStrings(shape); + tensor.CopyStringsFromCpu(data); +} + size_t PaddleGetDTypeSize(PaddleDType dt) { size_t size{0}; switch (dt) { @@ -726,11 +751,15 @@ void BindPaddleInferPredictor(py::module *m) { void BindZeroCopyTensor(py::module *m) { py::class_(*m, "ZeroCopyTensor") - .def("reshape", &ZeroCopyTensor::Reshape) + .def("reshape", py::overload_cast &>( + &ZeroCopyTensor::Reshape)) + .def("reshape", py::overload_cast( + &paddle_infer::Tensor::ReshapeStrings)) .def("copy_from_cpu", &ZeroCopyTensorCreate) .def("copy_from_cpu", &ZeroCopyTensorCreate) .def("copy_from_cpu", &ZeroCopyTensorCreate) .def("copy_from_cpu", &ZeroCopyTensorCreate) + .def("copy_from_cpu", &ZeroCopyStringTensorCreate) .def("copy_to_cpu", &ZeroCopyTensorToNumpy) .def("shape", &ZeroCopyTensor::shape) .def("set_lod", &ZeroCopyTensor::SetLoD) @@ -740,12 +769,16 @@ void BindZeroCopyTensor(py::module *m) { void BindPaddleInferTensor(py::module *m) { py::class_(*m, "PaddleInferTensor") - .def("reshape", &paddle_infer::Tensor::Reshape) + .def("reshape", py::overload_cast &>( + &paddle_infer::Tensor::Reshape)) + .def("reshape", py::overload_cast( + &paddle_infer::Tensor::ReshapeStrings)) .def("copy_from_cpu_bind", &PaddleInferTensorCreate) .def("copy_from_cpu_bind", &PaddleInferTensorCreate) .def("copy_from_cpu_bind", &PaddleInferTensorCreate) .def("copy_from_cpu_bind", &PaddleInferTensorCreate) + .def("copy_from_cpu_bind", &PaddleInferStringTensorCreate) .def("copy_to_cpu", &PaddleInferTensorToNumpy) .def("shape", &paddle_infer::Tensor::shape) .def("set_lod", &paddle_infer::Tensor::SetLoD) diff --git a/paddle/fluid/pybind/op_function_generator.cc b/paddle/fluid/pybind/op_function_generator.cc index 01d101909b549..d031709b76581 100644 --- a/paddle/fluid/pybind/op_function_generator.cc +++ b/paddle/fluid/pybind/op_function_generator.cc @@ -68,6 +68,7 @@ std::map> op_ins_map = { {"sparse_momentum", {"Param", "Grad", "Velocity", "Index", "LearningRate"}}, {"rnn", {"Input", "PreState", "WeightList", "SequenceLength"}}, {"run_program", {"X", "Params"}}, + {"faster_tokenizer", {"Text", "Vocab", "TextPair"}}, {"matrix_rank", {"X", "TolTensor"}}, {"adam", {"Param", "Grad", "LearningRate", "Moment1", "Moment2", "Beta1Pow", diff --git a/paddle/fluid/pybind/protobuf.cc b/paddle/fluid/pybind/protobuf.cc index 99607d7f9750f..984f3d1a31cce 100644 --- a/paddle/fluid/pybind/protobuf.cc +++ b/paddle/fluid/pybind/protobuf.cc @@ -227,7 +227,10 @@ void BindVarDsec(pybind11::module *m) { .value("LOD_TENSOR_ARRAY", pd::proto::VarType::LOD_TENSOR_ARRAY) .value("PLACE_LIST", pd::proto::VarType::PLACE_LIST) .value("READER", pd::proto::VarType::READER) - .value("RAW", pd::proto::VarType::RAW); + .value("RAW", pd::proto::VarType::RAW) + .value("STRING", pd::proto::VarType::STRING) + .value("STRINGS", pd::proto::VarType::STRINGS) + .value("VOCAB", pd::proto::VarType::VOCAB); } void BindOpDesc(pybind11::module *m) { diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index f58c2a5db381c..529e7c6dab8ce 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -1239,6 +1239,18 @@ All parameter, weight, gradient are variables in Paddle. 
[](Variable &self) { return py::bytes(*self.GetMutable()); }) + .def("set_string_list", + [](Variable &self, Strings str_list) { + *self.GetMutable() = str_list; + }) + .def("set_vocab", [](Variable &self, + Vocab vocab) { *self.GetMutable() = vocab; }) + .def("get_string_tensor", + [](Variable &self) { return self.GetMutable(); }, + py::return_value_policy::reference) + .def("get_map_tensor", + [](Variable &self) { return self.GetMutable(); }, + py::return_value_policy::reference) .def("get_lod_rank_table", [](Variable &self) { return self.GetMutable(); }, py::return_value_policy::reference) @@ -1872,20 +1884,20 @@ All parameter, weight, gradient are variables in Paddle. .def("__str__", string::to_string); py::class_(m, "Operator") - .def_static("create", - [](py::bytes protobin) { - proto::OpDesc desc; - PADDLE_ENFORCE_EQ(desc.ParsePartialFromString(protobin), - true, - platform::errors::InvalidArgument( - "Cannot parse user input to OpDesc")); - PADDLE_ENFORCE_EQ(desc.IsInitialized(), true, - platform::errors::InvalidArgument( - "The provided OpDesc is not " - "initialized, the reason is: %s", - desc.InitializationErrorString())); - return OpRegistry::CreateOp(desc); - }) + .def_static( + "create", + [](py::bytes protobin) { + proto::OpDesc desc; + PADDLE_ENFORCE_EQ(desc.ParsePartialFromString(protobin), true, + platform::errors::InvalidArgument( + "Cannot parse user input to OpDesc")); + PADDLE_ENFORCE_EQ( + desc.IsInitialized(), true, + platform::errors::InvalidArgument( + "The provided OpDesc is not initialized, the reason is: %s", + desc.InitializationErrorString())); + return OpRegistry::CreateOp(desc); + }) .def("run", [](OperatorBase &self, const Scope &scope, const platform::CPUPlace &place) { @@ -2139,7 +2151,12 @@ All parameter, weight, gradient are variables in Paddle. }); #endif - m.def("set_feed_variable", framework::SetFeedVariable); + m.def("set_feed_variable", + static_cast(&framework::SetFeedVariable)); + m.def("set_feed_variable", + static_cast(&framework::SetFeedVariable)); m.def("get_fetch_variable", [](const Scope &scope, const std::string &var_name, size_t index) -> py::object { diff --git a/python/paddle/fluid/dygraph/jit.py b/python/paddle/fluid/dygraph/jit.py index d41c373bf5093..2db9fb5d76a58 100644 --- a/python/paddle/fluid/dygraph/jit.py +++ b/python/paddle/fluid/dygraph/jit.py @@ -799,12 +799,17 @@ def fun(inputs): # 3. 
share parameters from Layer to scope & record var info for param_or_buffer in concrete_program.parameters: # share to scope - param_or_buffer_tensor = scope.var( - param_or_buffer.name).get_tensor() - #src_tensor = param_or_buffer.value().get_tensor() - src_tensor = state_var_dict[param_or_buffer.name].value( - ).get_tensor() - param_or_buffer_tensor._share_data_with(src_tensor) + if param_or_buffer.type == core.VarDesc.VarType.VOCAB: + scr_tensor = param_or_buffer.value().get_map_tensor() + tgt_var = scope.var(param_or_buffer.name) + tgt_var.set_vocab(scr_tensor) + else: + param_or_buffer_tensor = scope.var( + param_or_buffer.name).get_tensor() + #src_tensor = param_or_buffer.value().get_tensor() + src_tensor = state_var_dict[param_or_buffer.name].value( + ).get_tensor() + param_or_buffer_tensor._share_data_with(src_tensor) # record var info if param_or_buffer.name not in extra_var_info: extra_info_dict = dict() diff --git a/python/paddle/fluid/dygraph/layers.py b/python/paddle/fluid/dygraph/layers.py index e4b6bc0103426..694f9dc25e80c 100644 --- a/python/paddle/fluid/dygraph/layers.py +++ b/python/paddle/fluid/dygraph/layers.py @@ -1409,13 +1409,22 @@ def _check_match(key, param): if state is None: raise ValueError("{} is not found in the provided dict.".format( key)) - state_shape = state.shape() if inspect.ismethod( - state.shape) else state.shape - if list(state_shape) != list(param.shape): - raise ValueError( - "{} receives a shape {}, but the expected shape is {}.". - format(key, list(state_shape), list(param.shape))) - return param, state + if (isinstance(state, dict) or isinstance(state, list)): + if (len(state) != len(param)): + raise ValueError("{} receieves the length of {}, " + "but the expected shape is {}".format( + key, len(state), len(param))) + else: + return param, state + else: + state_shape = state.shape() if inspect.ismethod( + state.shape) else state.shape + + if list(state_shape) != list(param.shape): + raise ValueError( + "{} receives a shape {}, but the expected shape is {}.". 
+ format(key, list(state_shape), list(param.shape))) + return param, state matched_param_state = [] for key, param in self.state_dict().items(): diff --git a/python/paddle/fluid/dygraph/math_op_patch.py b/python/paddle/fluid/dygraph/math_op_patch.py index b92e54d4868df..3731976ad18ab 100644 --- a/python/paddle/fluid/dygraph/math_op_patch.py +++ b/python/paddle/fluid/dygraph/math_op_patch.py @@ -133,7 +133,12 @@ def _int_(var): return int(var.numpy().flatten()[0]) def _len_(var): - return var.shape[0] + if var.type == core.VarDesc.VarType.VOCAB: + return len(var.value().get_map_tensor()) + elif var.type == core.VarDesc.VarType.STRINGS: + return len(var.value().get_string_tensor()) + else: + return var.shape[0] def _index_(var): numel = np.prod(var.shape) diff --git a/python/paddle/fluid/dygraph/varbase_patch_methods.py b/python/paddle/fluid/dygraph/varbase_patch_methods.py index 9d8b1500d5b02..e2fd36448ba65 100644 --- a/python/paddle/fluid/dygraph/varbase_patch_methods.py +++ b/python/paddle/fluid/dygraph/varbase_patch_methods.py @@ -146,25 +146,35 @@ def set_value(self, value): out = linear(t) # call with different weight """ - assert isinstance(value, (np.ndarray, core.VarBase)), \ - "Variable set_value function, arguments type only support Variable, numpy, VarBase" - - value_np = value - if isinstance(value, core.VarBase): - value_np = value.numpy() + assert isinstance(value, (np.ndarray, core.VarBase, dict, str)), \ + "Variable set_value function, arguments type only support Variable, numpy, VarBase, dict, string." + + if isinstance(value, (dict, str)): + assert len(self) == len( + value + ), "Variable length not match, Variable [ {} ] need tensor with length {} but load set tensor with length {}".format( + self.name, len(self), len(value)) + if isinstance(value, dict): + self.value().set_vocab(value) + else: + self.value().set_string_list(value) + else: + value_np = value + if isinstance(value, core.VarBase): + value_np = value.numpy() - self_tensor_np = self.numpy() + self_tensor_np = self.numpy() - assert self_tensor_np.shape == value_np.shape, \ - "Variable Shape not match, Variable [ {} ] need tensor with shape {} but load set tensor with shape {}".format( - self.name, self_tensor_np.shape, value_np.shape) + assert self_tensor_np.shape == value_np.shape, \ + "Variable Shape not match, Variable [ {} ] need tensor with shape {} but load set tensor with shape {}".format( + self.name, self_tensor_np.shape, value_np.shape) - assert self_tensor_np.dtype == value_np.dtype, \ - "Variable dtype not match, Variable [ {} ] need tensor with dtype {} but load tensor with dtype {}".format( - self.name, self_tensor_np.dtype, value_np.dtype) + assert self_tensor_np.dtype == value_np.dtype, \ + "Variable dtype not match, Variable [ {} ] need tensor with dtype {} but load tensor with dtype {}".format( + self.name, self_tensor_np.dtype, value_np.dtype) - self.value().get_tensor().set(value_np, - framework._current_expected_place()) + self.value().get_tensor().set(value_np, + framework._current_expected_place()) @framework.dygraph_only def backward(self, grad_tensor=None, retain_graph=False): diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py index 17f8a7291ad8f..6fba200f54099 100644 --- a/python/paddle/fluid/executor.py +++ b/python/paddle/fluid/executor.py @@ -792,9 +792,11 @@ def _feed_data(self, program, feed, feed_var_name, scope): feed_target_name = op.desc.output('Out')[0] cur_feed = feed[feed_target_name] var = global_block.var(feed_target_name) - if not 
isinstance(cur_feed, core.LoDTensor): - cur_feed = _as_lodtensor(cur_feed, self.place, var.dtype) - check_feed_shape_type(var, cur_feed) + if var.dtype != core.VarDesc.VarType.STRINGS: + if not isinstance(cur_feed, core.LoDTensor): + cur_feed = _as_lodtensor(cur_feed, self.place, + var.dtype) + check_feed_shape_type(var, cur_feed) idx = op.desc.attr('col') core.set_feed_variable(scope, cur_feed, feed_var_name, idx) else: diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 60e00238f6cc9..a3cd34c32ebbf 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -979,6 +979,10 @@ def __init__(self, if not isinstance(dtype, core.VarDesc.VarType): dtype = convert_np_dtype_to_dtype_(dtype) + if dtype == core.VarDesc.VarType.STRINGS: + type = core.VarDesc.VarType.STRINGS + lod_level = None + self.belong_to_optimizer = belong_to_optimizer self.error_clip = error_clip diff --git a/python/paddle/fluid/inference/wrapper.py b/python/paddle/fluid/inference/wrapper.py index 2c1b2c77504d9..6576ca785b6e1 100644 --- a/python/paddle/fluid/inference/wrapper.py +++ b/python/paddle/fluid/inference/wrapper.py @@ -29,10 +29,14 @@ def tensor_copy_from_cpu(self, data): ''' Support input type check based on tensor.copy_from_cpu. ''' - if not isinstance(data, np.ndarray): + if isinstance(data, np.ndarray) or (isinstance(data, list) and + len(data) > 0 and + isinstance(data[0], str)): + self.copy_from_cpu_bind(data) + else: raise TypeError( - "In copy_from_cpu, we only support numpy ndarray data type.") - self.copy_from_cpu_bind(data) + "In copy_from_cpu, we only support numpy ndarray and list[str] data type." + ) Tensor.copy_from_cpu = tensor_copy_from_cpu diff --git a/python/paddle/fluid/tests/unittests/test_faster_tokenizer_op.py b/python/paddle/fluid/tests/unittests/test_faster_tokenizer_op.py new file mode 100755 index 0000000000000..496f3505ec41b --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_faster_tokenizer_op.py @@ -0,0 +1,393 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import io +import os +import unittest + +import numpy as np +import paddle +import paddle.nn as nn +from paddle.dataset.common import DATA_HOME +from paddle.fluid.framework import core, in_dygraph_mode +from paddle.fluid.layer_helper import LayerHelper + +import sys +sys.path.append("./tokenizer") +from tokenizer.bert_tokenizer import BertTokenizer + + +def to_string_tensor(string_values, name): + """ + Create the tensor that the value holds the list of string. + NOTICE: The value will be holded in the cpu place. + + Args: + string_values(list[string]): The value will be setted to the tensor. + name(string): The name of the tensor. 
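+    Returns:
+        Tensor: The created tensor, whose variable type is STRINGS and whose
+            value is held on the CPU place.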
+ """ + tensor = paddle.Tensor(core.VarDesc.VarType.STRING, [], name, + core.VarDesc.VarType.STRINGS, False) + tensor.value().set_string_list(string_values) + return tensor + + +def to_map_tensor(string_dict, name): + """ + Create the tensor that the value holds the map, the type of key is the string + and the value is the int. + NOTICE: The value will be holded in the cpu place. + + Args: + string_dict(dict): The value will be setted to the tensor. + name(string): The name of the tensor. + """ + tensor = paddle.Tensor(core.VarDesc.VarType.RAW, [], name, + core.VarDesc.VarType.VOCAB, True) + tensor.value().set_vocab(string_dict) + return tensor + + +class FasterTokenizer(nn.Layer): + def __init__(self, vocab_dict): + super(FasterTokenizer, self).__init__() + vocab_tensor = to_map_tensor(vocab_dict, "vocab") + self.register_buffer("vocab", vocab_tensor, persistable=True) + + def forward(self, + text, + text_pair=None, + do_lower_case=True, + max_seq_len=-1, + is_split_into_words=False, + pad_to_max_seq_len=False): + if in_dygraph_mode(): + input_ids, seg_ids = core.ops.faster_tokenizer( + self.vocab, text, text_pair, "do_lower_case", do_lower_case, + "max_seq_len", max_seq_len, "pad_to_max_seq_len", + pad_to_max_seq_len, "is_split_into_words", is_split_into_words) + return input_ids, seg_ids + + attrs = { + "do_lower_case": do_lower_case, + "max_seq_len": max_seq_len, + "pad_to_max_seq_len": pad_to_max_seq_len, + "is_split_into_words": is_split_into_words, + } + helper = LayerHelper("faster_tokenizer") + input_ids = helper.create_variable_for_type_inference(dtype="int64") + seg_ids = helper.create_variable_for_type_inference(dtype="int64") + if text_pair is None: + helper.append_op( + type='faster_tokenizer', + inputs={'Vocab': self.vocab, + 'Text': text}, + outputs={'InputIds': input_ids, + 'SegmentIds': seg_ids}, + attrs=attrs) + else: + helper.append_op( + type='faster_tokenizer', + inputs={ + 'Vocab': self.vocab, + 'Text': text, + 'TextPair': text_pair + }, + outputs={'InputIds': input_ids, + 'SegmentIds': seg_ids}, + attrs=attrs) + return input_ids, seg_ids + + +class Predictor(object): + def __init__(self, model_dir): + model_file = os.path.join(model_dir, "inference.pdmodel") + params_file = os.path.join(model_dir, "inference.pdiparams") + if not os.path.exists(model_file): + raise ValueError("not find model file path {}".format(model_file)) + if not os.path.exists(params_file): + raise ValueError("not find params file path {}".format(params_file)) + config = paddle.inference.Config(model_file, params_file) + + # fast_tokenizer op only support cpu. 
+ config.disable_gpu() + config.set_cpu_math_library_num_threads(10) + + config.switch_use_feed_fetch_ops(False) + self.predictor = paddle.inference.create_predictor(config) + self.input_handles = [ + self.predictor.get_input_handle(name) + for name in self.predictor.get_input_names() + ] + self.output_handles = [ + self.predictor.get_output_handle(name) + for name in self.predictor.get_output_names() + ] + + def predict(self, data): + + self.input_handles[0].copy_from_cpu(data) + self.predictor.run() + input_ids = self.output_handles[0].copy_to_cpu() + token_type_ids = self.output_handles[1].copy_to_cpu() + return input_ids, token_type_ids + + +class TestBertTokenizerOp(unittest.TestCase): + def setUp(self): + self.bert_tokenizer = BertTokenizer.from_pretrained("bert-base-chinese") + self.faster_tokenizer = FasterTokenizer(self.bert_tokenizer.vocab) + self.init_data() + self.save_path = os.path.join(DATA_HOME, "fast_tokenizer") + self.param_path = os.path.join(self.save_path, "model.pdparams") + self.inference_path = os.path.join(self.save_path, "inference") + + def init_data(self): + self.text = [ + '选ꋩē ę±ŸčŠ±å›­ēš„原因就ę˜Æę–¹ä¾æļ¼Œęœ‰ē”µåŠØꉶę¢Æē›“ęŽ„åˆ°č¾¾ęµ·č¾¹ļ¼Œå‘Øå›“é¤é¦†ć€é£Ÿå»Šć€å•†åœŗ态超åø‚ć€ę‘Šä½äø€åŗ”äæ±å…Ø怂' + '酒åŗ—č£…äæ®äø€čˆ¬ļ¼Œä½†čæ˜ē®—ę•“꓁怂 ę³³ę± åœØ大堂ēš„屋锶ļ¼Œå› ę­¤å¾ˆå°ļ¼Œäøčæ‡å„³å„æ倒ę˜Æå–œę¬¢ć€‚ 包ēš„ꗩ餐ę˜Æč„æ式ēš„ļ¼Œ' + 'čæ˜ē®—äø°åÆŒć€‚ ęœåŠ”å—ļ¼Œäø€čˆ¬' + ] + self.text_pair = ['非åøøäøé”™ļ¼ŒęœåŠ”很儽ļ¼Œä½äŗŽåø‚äø­åæƒåŒŗļ¼Œäŗ¤é€šę–¹ä¾æļ¼Œäøčæ‡ä»·ę ¼ä¹Ÿé«˜ļ¼'] + self.text_tensor = to_string_tensor(self.text, "text") + self.text_pair_tensor = to_string_tensor(self.text_pair, "text_pair") + self.texts = [ + '很儽ēš„地ē†ä½ē½®ļ¼Œäø€č¹‹ē³Šę¶‚ēš„ęœåŠ”ļ¼Œč§ę”ēš„é…’åŗ—怂', + ' 选ꋩē ę±ŸčŠ±å›­ēš„原因就ę˜Æę–¹ä¾æļ¼Œęœ‰ē”µåŠØꉶę¢Æē›“ęŽ„åˆ°č¾¾ęµ·č¾¹ļ¼Œå‘Øå›“é¤é¦†ć€é£Ÿå»Šć€å•†åœŗ态超åø‚ć€ę‘Šä½äø€åŗ”äæ±å…Ø怂酒åŗ—č£…äæ®äø€čˆ¬ļ¼Œ' + '但čæ˜ē®—ę•“꓁怂 ę³³ę± åœØ大堂ēš„屋锶ļ¼Œå› ę­¤å¾ˆå°ļ¼Œäøčæ‡å„³å„æ倒ę˜Æå–œę¬¢ć€‚ 包ēš„ꗩ餐ę˜Æč„æ式ēš„ļ¼Œčæ˜ē®—äø°åÆŒć€‚ ęœåŠ”å—ļ¼Œäø€čˆ¬', + 'Test bert tokenizer. The first text.' + ] + self.text_pairs = [ + '非åøøäøé”™ļ¼ŒęœåŠ”很儽ļ¼Œä½äŗŽåø‚äø­åæƒåŒŗļ¼Œäŗ¤é€šę–¹ä¾æļ¼Œäøčæ‡ä»·ę ¼ä¹Ÿé«˜ļ¼', 'ęˆæé—“å¤Ŗå°ć€‚å…¶ä»–ēš„都äø€čˆ¬ć€‚怂怂怂怂怂怂怂怂', + 'Test bert tokenizer. The second text.' 
+ ] + self.texts_tensor = to_string_tensor(self.texts, "texts") + self.text_pairs_tensor = to_string_tensor(self.text_pairs, "text_pairs") + + def test_padding(self): + + self.max_seq_len = 128 + self.pad_to_max_seq_len = True + self.is_split_into_words = False + + # case 1: only one text (batch_size = 1) + input_ids, token_type_ids = self.faster_tokenizer( + text=self.text_tensor, + do_lower_case=self.bert_tokenizer.do_lower_case, + max_seq_len=self.max_seq_len, + pad_to_max_seq_len=self.pad_to_max_seq_len, + is_split_into_words=self.is_split_into_words) + input_ids = input_ids.numpy() + token_type_ids = token_type_ids.numpy() + + encoded_inputs = self.bert_tokenizer( + text=self.text, + max_seq_len=self.max_seq_len, + pad_to_max_seq_len=self.pad_to_max_seq_len, + is_split_into_words=self.is_split_into_words) + py_input_ids = np.array(encoded_inputs[0]["input_ids"]).reshape([1, -1]) + py_token_type_ids = np.array(encoded_inputs[0][ + "token_type_ids"]).reshape([1, -1]) + self.assertTrue(np.allclose(input_ids, py_input_ids, rtol=0, atol=0.01)) + self.assertTrue( + np.allclose( + token_type_ids, py_token_type_ids, rtol=0, atol=0.01)) + + # case 2: only one text and one text_pair (batch_size = 1) + input_ids, token_type_ids = self.faster_tokenizer( + text=self.text_tensor, + text_pair=self.text_pair_tensor, + do_lower_case=self.bert_tokenizer.do_lower_case, + max_seq_len=self.max_seq_len, + pad_to_max_seq_len=self.pad_to_max_seq_len, + is_split_into_words=self.is_split_into_words) + input_ids = input_ids.numpy() + token_type_ids = token_type_ids.numpy() + + encoded_inputs = self.bert_tokenizer( + text=self.text, + text_pair=self.text_pair, + max_seq_len=self.max_seq_len, + pad_to_max_seq_len=self.pad_to_max_seq_len, + is_split_into_words=self.is_split_into_words) + py_input_ids = np.array(encoded_inputs[0]["input_ids"]).reshape([1, -1]) + py_token_type_ids = np.array(encoded_inputs[0][ + "token_type_ids"]).reshape([1, -1]) + self.assertTrue(np.allclose(input_ids, py_input_ids, rtol=0, atol=0.01)) + self.assertTrue( + np.allclose( + token_type_ids, py_token_type_ids, rtol=0, atol=0.01)) + + # case 3: only texts (batch_size = 3) + input_ids, token_type_ids = self.faster_tokenizer( + text=self.texts_tensor, + do_lower_case=self.bert_tokenizer.do_lower_case, + max_seq_len=self.max_seq_len, + pad_to_max_seq_len=self.pad_to_max_seq_len, + is_split_into_words=self.is_split_into_words) + input_ids = input_ids.numpy() + token_type_ids = token_type_ids.numpy() + + encoded_inputs = self.bert_tokenizer( + self.texts, + max_seq_len=self.max_seq_len, + pad_to_max_seq_len=self.pad_to_max_seq_len, + is_split_into_words=self.is_split_into_words) + py_input_ids = [i["input_ids"] for i in encoded_inputs] + py_token_type_ids = [i["token_type_ids"] for i in encoded_inputs] + py_input_ids = np.array(py_input_ids).reshape([3, -1]) + py_token_type_ids = np.array(py_token_type_ids).reshape([3, -1]) + self.assertTrue(np.allclose(input_ids, py_input_ids, rtol=0, atol=0.01)) + self.assertTrue( + np.allclose( + token_type_ids, py_token_type_ids, rtol=0, atol=0.01)) + + # case 4: texts and text pairs (batch_size = 3) + input_ids, token_type_ids = self.faster_tokenizer( + text=self.texts_tensor, + text_pair=self.text_pairs_tensor, + do_lower_case=self.bert_tokenizer.do_lower_case, + max_seq_len=self.max_seq_len, + pad_to_max_seq_len=self.pad_to_max_seq_len, + is_split_into_words=self.is_split_into_words) + input_ids = input_ids.numpy() + token_type_ids = token_type_ids.numpy() + + encoded_inputs = self.bert_tokenizer( + 
self.texts, + self.text_pairs, + max_seq_len=self.max_seq_len, + pad_to_max_seq_len=self.pad_to_max_seq_len, + is_split_into_words=self.is_split_into_words) + py_input_ids = [i["input_ids"] for i in encoded_inputs] + py_token_type_ids = [i["token_type_ids"] for i in encoded_inputs] + py_input_ids = np.array(py_input_ids).reshape([3, -1]) + py_token_type_ids = np.array(py_token_type_ids).reshape([3, -1]) + self.assertTrue(np.allclose(input_ids, py_input_ids, rtol=0, atol=0.01)) + self.assertTrue( + np.allclose( + token_type_ids, py_token_type_ids, rtol=0, atol=0.01)) + + def test_no_padding(self): + self.max_seq_len = 128 + self.pad_to_max_seq_len = False + self.is_split_into_words = False + + # case 1: only one text (batch_size = 1) + input_ids, token_type_ids = self.faster_tokenizer( + text=self.text_tensor, + do_lower_case=self.bert_tokenizer.do_lower_case, + max_seq_len=self.max_seq_len, + pad_to_max_seq_len=self.pad_to_max_seq_len, + is_split_into_words=self.is_split_into_words) + input_ids = input_ids.numpy() + token_type_ids = token_type_ids.numpy() + + encoded_inputs = self.bert_tokenizer( + self.text, + max_seq_len=self.max_seq_len, + pad_to_max_seq_len=self.pad_to_max_seq_len, + is_split_into_words=self.is_split_into_words) + py_input_ids = np.array(encoded_inputs[0]["input_ids"]).reshape([1, -1]) + py_token_type_ids = np.array(encoded_inputs[0][ + "token_type_ids"]).reshape([1, -1]) + self.assertTrue(np.allclose(input_ids, py_input_ids, rtol=0, atol=0.01)) + self.assertTrue( + np.allclose( + token_type_ids, py_token_type_ids, rtol=0, atol=0.01)) + + # case 2: only one text and one text_pair (batch_size = 1) + input_ids, token_type_ids = self.faster_tokenizer( + self.text_tensor, + self.text_pair_tensor, + do_lower_case=self.bert_tokenizer.do_lower_case, + max_seq_len=self.max_seq_len, + pad_to_max_seq_len=self.pad_to_max_seq_len, + is_split_into_words=self.is_split_into_words) + input_ids = input_ids.numpy() + token_type_ids = token_type_ids.numpy() + + encoded_inputs = self.bert_tokenizer( + self.text, + self.text_pair, + max_seq_len=self.max_seq_len, + pad_to_max_seq_len=self.pad_to_max_seq_len, + is_split_into_words=self.is_split_into_words) + py_input_ids = np.array(encoded_inputs[0]["input_ids"]).reshape([1, -1]) + py_token_type_ids = np.array(encoded_inputs[0][ + "token_type_ids"]).reshape([1, -1]) + self.assertTrue(np.allclose(input_ids, py_input_ids, rtol=0, atol=0.01)) + self.assertTrue( + np.allclose( + token_type_ids, py_token_type_ids, rtol=0, atol=0.01)) + + def test_is_split_into_words(self): + self.is_split_into_words = True + + input_ids, token_type_ids = self.faster_tokenizer( + self.text_tensor, + do_lower_case=self.bert_tokenizer.do_lower_case, + is_split_into_words=self.is_split_into_words) + input_ids = input_ids.numpy() + token_type_ids = token_type_ids.numpy() + encoded_inputs = self.bert_tokenizer( + list(self.text[0]), is_split_into_words=self.is_split_into_words) + py_input_ids = np.array(encoded_inputs["input_ids"]).reshape([1, -1]) + py_token_type_ids = np.array(encoded_inputs["token_type_ids"]).reshape( + [1, -1]) + self.assertTrue(np.allclose(input_ids, py_input_ids, rtol=0, atol=0.01)) + self.assertTrue( + np.allclose( + token_type_ids, py_token_type_ids, rtol=0, atol=0.01)) + + def test_inference(self): + if not os.path.exists(self.save_path): + os.makedirs(self.save_path, exist_ok=True) + paddle.save(self.faster_tokenizer.state_dict(), self.param_path) + state_dict = paddle.load(self.param_path) + self.faster_tokenizer.set_dict(state_dict) + + 
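+        # Export to a static graph; the InputSpec below uses the STRINGS dtype
+        # so the saved inference model accepts a list of raw text directly.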
static_model = paddle.jit.to_static( + self.faster_tokenizer, + input_spec=[ + paddle.static.InputSpec( + shape=[None], dtype=core.VarDesc.VarType.STRINGS), # texts + ]) + # Save in static graph model. + paddle.jit.save(static_model, self.inference_path) + predictor = Predictor(self.save_path) + input_ids, token_type_ids = predictor.predict(self.text) + + encoded_inputs = self.bert_tokenizer(self.text) + py_input_ids = np.array(encoded_inputs[0]["input_ids"]).reshape([1, -1]) + py_token_type_ids = np.array(encoded_inputs[0][ + "token_type_ids"]).reshape([1, -1]) + self.assertTrue(np.allclose(input_ids, py_input_ids, rtol=0, atol=0.01)) + self.assertTrue( + np.allclose( + token_type_ids, py_token_type_ids, rtol=0, atol=0.01)) + + def test_feed_string_var(self): + paddle.enable_static() + x = paddle.static.data( + name="x", shape=[-1], dtype=core.VarDesc.VarType.STRINGS) + exe = paddle.static.Executor(paddle.framework.CPUPlace()) + exe.run(paddle.static.default_main_program(), feed={'x': self.text}) + paddle.disable_static() + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/tokenizer/__init__.py b/python/paddle/fluid/tests/unittests/tokenizer/__init__.py new file mode 100644 index 0000000000000..b9a7651e44909 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/tokenizer/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/paddle/fluid/tests/unittests/tokenizer/bert_tokenizer.py b/python/paddle/fluid/tests/unittests/tokenizer/bert_tokenizer.py new file mode 100755 index 0000000000000..00d5f4e772528 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/tokenizer/bert_tokenizer.py @@ -0,0 +1,517 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import copy +import io +import json +import os +import six +import unicodedata + +from tokenizer_utils import PretrainedTokenizer +from tokenizer_utils import convert_to_unicode, whitespace_tokenize, _is_whitespace, _is_control, _is_punctuation + + +class BasicTokenizer(object): + """ + Runs basic tokenization (punctuation splitting, lower casing, etc.). + Args: + do_lower_case (bool): + Whether or not to lowercase the input when tokenizing. + Defaults to `True`. 
+ """ + + def __init__(self, do_lower_case=True): + """Constructs a BasicTokenizer.""" + + self.do_lower_case = do_lower_case + + def tokenize(self, text): + """ + Tokenizes a piece of text using basic tokenizer. + Args: + text (str): A piece of text. + Returns: + list(str): A list of tokens. + Examples: + .. code-block:: + from paddlenlp.transformers import BasicTokenizer + basictokenizer = BasicTokenizer() + tokens = basictokenizer.tokenize('He was a puppeteer') + ''' + ['he', 'was', 'a', 'puppeteer'] + ''' + """ + + text = convert_to_unicode(text) + text = self._clean_text(text) + text = self._tokenize_chinese_chars(text) + + orig_tokens = whitespace_tokenize(text) + split_tokens = [] + for token in orig_tokens: + if self.do_lower_case: + token = token.lower() + token = self._run_strip_accents(token) + split_tokens.extend(self._run_split_on_punc(token)) + + output_tokens = whitespace_tokenize(" ".join(split_tokens)) + return output_tokens + + def _run_strip_accents(self, text): + """ + Strips accents from a piece of text. + """ + text = unicodedata.normalize("NFD", text) + output = [] + for char in text: + cat = unicodedata.category(char) + if cat == "Mn": + continue + output.append(char) + return "".join(output) + + def _run_split_on_punc(self, text): + """ + Splits punctuation on a piece of text. + """ + chars = list(text) + i = 0 + start_new_word = True + output = [] + while i < len(chars): + char = chars[i] + if _is_punctuation(char): + output.append([char]) + start_new_word = True + else: + if start_new_word: + output.append([]) + start_new_word = False + output[-1].append(char) + i += 1 + + return ["".join(x) for x in output] + + def _tokenize_chinese_chars(self, text): + """ + Adds whitespace around any CJK character. + """ + output = [] + for char in text: + cp = ord(char) + if self._is_chinese_char(cp): + output.append(" ") + output.append(char) + output.append(" ") + else: + output.append(char) + return "".join(output) + + def _is_chinese_char(self, cp): + """ + Checks whether CP is the codepoint of a CJK character. + """ + + # This defines a "chinese character" as anything in the CJK Unicode block: + # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) + # + # Note that the CJK Unicode block is NOT all Japanese and Korean characters, + # despite its name. The modern Korean Hangul alphabet is a different block, + # as is Japanese Hiragana and Katakana. Those alphabets are used to write + # space-separated words, so they are not treated specially and handled + # like the all of the other languages. + if ((cp >= 0x4E00 and cp <= 0x9FFF) or # + (cp >= 0x3400 and cp <= 0x4DBF) or # + (cp >= 0x20000 and cp <= 0x2A6DF) or # + (cp >= 0x2A700 and cp <= 0x2B73F) or # + (cp >= 0x2B740 and cp <= 0x2B81F) or # + (cp >= 0x2B820 and cp <= 0x2CEAF) or + (cp >= 0xF900 and cp <= 0xFAFF) or # + (cp >= 0x2F800 and cp <= 0x2FA1F)): # + return True + + return False + + def _clean_text(self, text): + """ + Performs invalid character removal and whitespace cleanup on text. + """ + output = [] + for char in text: + cp = ord(char) + if cp == 0 or cp == 0xfffd or _is_control(char): + continue + if _is_whitespace(char): + output.append(" ") + else: + output.append(char) + return "".join(output) + + +class WordpieceTokenizer(object): + """ + Runs WordPiece tokenization. + Args: + vocab (Vocab|dict): + Vocab of the word piece tokenizer. + unk_token (str): + A specific token to replace all unknown tokens. 
+ max_input_chars_per_word (int): + If a word's length is more than + max_input_chars_per_word, it will be dealt as unknown word. + Defaults to 100. + """ + + def __init__(self, vocab, unk_token, max_input_chars_per_word=100): + self.vocab = vocab + self.unk_token = unk_token + self.max_input_chars_per_word = max_input_chars_per_word + + def tokenize(self, text): + """ + Tokenizes a piece of text into its word pieces. + This uses a greedy longest-match-first algorithm to perform tokenization + using the given vocabulary. + Args: + text: A single token or whitespace separated tokens. This should have + already been passed through `BasicTokenizer`. + Returns: + list (str): A list of wordpiece tokens. + Examples: + .. code-block:: + from paddlenlp.transformers import BertTokenizer, WordpieceTokenizer + berttokenizer = BertTokenizer.from_pretrained('bert-base-uncased') + vocab = berttokenizer.vocab + unk_token = berttokenizer.unk_token + wordpiecetokenizer = WordpieceTokenizer(vocab,unk_token) + inputs = wordpiecetokenizer.tokenize("unaffable") + print(inputs) + ''' + ["un", "##aff", "##able"] + ''' + """ + + output_tokens = [] + for token in whitespace_tokenize(text): + chars = list(token) + if len(chars) > self.max_input_chars_per_word: + output_tokens.append(self.unk_token) + continue + + is_bad = False + start = 0 + sub_tokens = [] + while start < len(chars): + end = len(chars) + cur_substr = None + while start < end: + substr = "".join(chars[start:end]) + if start > 0: + substr = "##" + substr + if substr in self.vocab: + cur_substr = substr + break + end -= 1 + if cur_substr is None: + is_bad = True + break + sub_tokens.append(cur_substr) + start = end + + if is_bad: + output_tokens.append(self.unk_token) + else: + output_tokens.extend(sub_tokens) + return output_tokens + + +class BertTokenizer(PretrainedTokenizer): + """ + Constructs a BERT tokenizer. It uses a basic tokenizer to do punctuation + splitting, lower casing and so on, and follows a WordPiece tokenizer to + tokenize as subwords. + Args: + vocab_file (str): + The vocabulary file path (ends with '.txt') required to instantiate + a `WordpieceTokenizer`. + do_lower_case (bool): + Whether or not to lowercase the input when tokenizing. + Defaults to`True`. + unk_token (str): + A special token representing the *unknown (out-of-vocabulary)* token. + An unknown token is set to be `unk_token` inorder to be converted to an ID. + Defaults to "[UNK]". + sep_token (str): + A special token separating two different sentences in the same input. + Defaults to "[SEP]". + pad_token (str): + A special token used to make arrays of tokens the same size for batching purposes. + Defaults to "[PAD]". + cls_token (str): + A special token used for sequence classification. It is the last token + of the sequence when built with special tokens. Defaults to "[CLS]". + mask_token (str): + A special token representing a masked token. This is the token used + in the masked language modeling task which the model tries to predict the original unmasked ones. + Defaults to "[MASK]". + Examples: + .. 
code-block:: + from paddlenlp.transformers import BertTokenizer + berttokenizer = BertTokenizer.from_pretrained('bert-base-uncased') + inputs = berttokenizer.tokenize('He was a puppeteer') + print(inputs) + ''' + {'input_ids': [101, 2002, 2001, 1037, 13997, 11510, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0]} + ''' + """ + resource_files_names = {"vocab_file": "vocab.txt"} # for save_pretrained + pretrained_resource_files_map = { + "vocab_file": { + "bert-base-uncased": + "https://paddle-hapi.bj.bcebos.com/models/bert/bert-base-uncased-vocab.txt", + "bert-large-uncased": + "https://paddle-hapi.bj.bcebos.com/models/bert/bert-large-uncased-vocab.txt", + "bert-base-cased": + "https://paddle-hapi.bj.bcebos.com/models/bert/bert-base-cased-vocab.txt", + "bert-large-cased": + "https://paddle-hapi.bj.bcebos.com/models/bert/bert-large-cased-vocab.txt", + "bert-base-multilingual-uncased": + "https://paddle-hapi.bj.bcebos.com/models/bert/bert-base-multilingual-uncased-vocab.txt", + "bert-base-multilingual-cased": + "https://paddle-hapi.bj.bcebos.com/models/bert/bert-base-multilingual-cased-vocab.txt", + "bert-base-chinese": + "https://paddle-hapi.bj.bcebos.com/models/bert/bert-base-chinese-vocab.txt", + "bert-wwm-chinese": + "http://paddlenlp.bj.bcebos.com/models/transformers/bert/bert-wwm-chinese-vocab.txt", + "bert-wwm-ext-chinese": + "http://paddlenlp.bj.bcebos.com/models/transformers/bert/bert-wwm-ext-chinese-vocab.txt", + "macbert-large-chinese": + "https://paddle-hapi.bj.bcebos.com/models/bert/bert-base-chinese-vocab.txt", + "macbert-base-chinese": + "https://paddle-hapi.bj.bcebos.com/models/bert/bert-base-chinese-vocab.txt", + "simbert-base-chinese": + "https://paddlenlp.bj.bcebos.com/models/transformers/simbert/vocab.txt", + } + } + pretrained_init_configuration = { + "bert-base-uncased": { + "do_lower_case": True + }, + "bert-large-uncased": { + "do_lower_case": True + }, + "bert-base-cased": { + "do_lower_case": False + }, + "bert-large-cased": { + "do_lower_case": False + }, + "bert-base-multilingual-uncased": { + "do_lower_case": True + }, + "bert-base-multilingual-cased": { + "do_lower_case": False + }, + "bert-base-chinese": { + "do_lower_case": False + }, + "bert-wwm-chinese": { + "do_lower_case": False + }, + "bert-wwm-ext-chinese": { + "do_lower_case": False + }, + "macbert-large-chinese": { + "do_lower_case": False + }, + "macbert-base-chinese": { + "do_lower_case": False + }, + "simbert-base-chinese": { + "do_lower_case": True + }, + } + padding_side = 'right' + + def __init__(self, + vocab_file, + do_lower_case=True, + unk_token="[UNK]", + sep_token="[SEP]", + pad_token="[PAD]", + cls_token="[CLS]", + mask_token="[MASK]"): + + if not os.path.isfile(vocab_file): + raise ValueError( + "Can't find a vocabulary file at path '{}'. To load the " + "vocabulary from a pretrained model please use " + "`tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`" + .format(vocab_file)) + self.vocab = self.load_vocabulary(vocab_file, unk_token=unk_token) + self.do_lower_case = do_lower_case + self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case) + self.wordpiece_tokenizer = WordpieceTokenizer( + vocab=self.vocab, unk_token=unk_token) + self.special_tokens_map = { + 'unk_token': unk_token, + 'sep_token': sep_token, + 'pad_token': pad_token, + 'cls_token': cls_token, + 'mask_token': mask_token + } + + @property + def vocab_size(self): + """ + Return the size of vocabulary. + Returns: + int: The size of vocabulary. 
+ """ + + return len(self.vocab) + + def _tokenize(self, text): + """ + End-to-end tokenization for BERT models. + Args: + text (str): The text to be tokenized. + + Returns: + list: A list of string representing converted tokens. + """ + split_tokens = [] + for token in self.basic_tokenizer.tokenize(text): + for sub_token in self.wordpiece_tokenizer.tokenize(token): + split_tokens.append(sub_token) + return split_tokens + + def tokenize(self, text): + """ + Converts a string to a list of tokens. + Args: + text (str): The text to be tokenized. + + Returns: + List(str): A list of string representing converted tokens. + Examples: + .. code-block:: + from paddlenlp.transformers import BertTokenizer + berttokenizer = BertTokenizer.from_pretrained('bert-base-uncased') + tokens = berttokenizer.tokenize('He was a puppeteer') + + ''' + ['he', 'was', 'a', 'puppet', '##eer'] + ''' + """ + + return self._tokenize(text) + + def num_special_tokens_to_add(self, pair=False): + """ + Returns the number of added tokens when encoding a sequence with special tokens. + Args: + pair(bool): + Whether the input is a sequence pair or a single sequence. + Defaults to `False` and the input is a single sequence. + Returns: + int: Number of tokens added to sequences. + """ + token_ids_0 = [] + token_ids_1 = [] + return len( + self.build_inputs_with_special_tokens(token_ids_0, token_ids_1 + if pair else None)) + + def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. + + A BERT sequence has the following format: + - single sequence: ``[CLS] X [SEP]`` + - pair of sequences: ``[CLS] A [SEP] B [SEP]`` + Args: + token_ids_0 (List[int]): + List of IDs to which the special tokens will be added. + token_ids_1 (List[int], optional): + Optional second list of IDs for sequence pairs. Defaults to None. + Returns: + List[int]: List of input_id with the appropriate special tokens. + """ + if token_ids_1 is None: + return [self.cls_token_id] + token_ids_0 + [self.sep_token_id] + _cls = [self.cls_token_id] + _sep = [self.sep_token_id] + return _cls + token_ids_0 + _sep + token_ids_1 + _sep + + def create_token_type_ids_from_sequences(self, + token_ids_0, + token_ids_1=None): + """ + Create a mask from the two sequences passed to be used in a sequence-pair classification task. + A BERT sequence pair mask has the following format: + :: + 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 + | first sequence | second sequence | + If :obj:`token_ids_1` is :obj:`None`, this method only returns the first portion of the mask (0s). + Args: + token_ids_0 (List[int]): + A list of `inputs_ids` for the first sequence. + token_ids_1 (List[int], optional): + Optional second list of IDs for sequence pairs. Defaults to None. + Returns: + List[int]: List of token_type_id according to the given sequence(s). + """ + _sep = [self.sep_token_id] + _cls = [self.cls_token_id] + if token_ids_1 is None: + return len(_cls + token_ids_0 + _sep) * [0] + return len(_cls + token_ids_0 + _sep) * [0] + len(token_ids_1 + + _sep) * [1] + + def get_special_tokens_mask(self, + token_ids_0, + token_ids_1=None, + already_has_special_tokens=False): + """ + Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding + special tokens using the tokenizer ``encode`` methods. + Args: + token_ids_0 (List[int]): + A list of `inputs_ids` for the first sequence. 
+            token_ids_1 (List[int], optional):
+                Optional second list of IDs for sequence pairs. Defaults to None.
+            already_has_special_tokens (bool, optional): Whether or not the token list is already
+                formatted with special tokens for the model. Defaults to False.
+        Returns:
+            List[int]: The list of integers, each either 0 or 1: 1 for a special token, 0 for a sequence token.
+        """
+
+        if already_has_special_tokens:
+            if token_ids_1 is not None:
+                raise ValueError(
+                    "You should not supply a second sequence if the provided sequence of "
+                    "ids is already formatted with special tokens for the model."
+                )
+            return list(
+                map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0,
+                    token_ids_0))
+
+        if token_ids_1 is not None:
+            return [1] + ([0] * len(token_ids_0)) + [1] + (
+                [0] * len(token_ids_1)) + [1]
+        return [1] + ([0] * len(token_ids_0)) + [1]
diff --git a/python/paddle/fluid/tests/unittests/tokenizer/tokenizer_utils.py b/python/paddle/fluid/tests/unittests/tokenizer/tokenizer_utils.py
new file mode 100644
index 0000000000000..7da3cd56e25b5
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/tokenizer/tokenizer_utils.py
@@ -0,0 +1,1244 @@
+# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import copy
+import io
+import json
+import os
+import unicodedata
+from shutil import copyfile
+from typing import Iterable, Iterator, Optional, List, Any, Callable, Union
+
+from paddle.dataset.common import DATA_HOME
+from paddle.utils.download import get_path_from_url
+
+
+def convert_to_unicode(text):
+    """
+    Converts `text` to Unicode (if it's not already), assuming utf-8 input.
+    Args:
+        text (str|bytes): Text to be converted to unicode.
+    Returns:
+        str: converted text.
+    """
+    if isinstance(text, str):
+        return text
+    elif isinstance(text, bytes):
+        return text.decode("utf-8", "ignore")
+    else:
+        raise ValueError("Unsupported string type: %s" % (type(text)))
+
+
+def whitespace_tokenize(text):
+    """
+    Runs basic whitespace cleaning and splitting on a piece of text.
+    Args:
+        text (str): Text to be tokenized.
+    Returns:
+        list(str): Token list.
+    """
+    text = text.strip()
+    if not text:
+        return []
+    tokens = text.split()
+    return tokens
+
+
+def _is_whitespace(char):
+    """
+    Checks whether `char` is a whitespace character.
+    """
+    # \t, \n, and \r are technically control characters but we treat them
+    # as whitespace since they are generally considered as such.
+    if char == " " or char == "\t" or char == "\n" or char == "\r":
+        return True
+    cat = unicodedata.category(char)
+    if cat == "Zs":
+        return True
+    return False
+
+
+def _is_control(char):
+    """Checks whether `char` is a control character."""
+    # These are technically control characters but we count them as whitespace
+    # characters.
+ if char == "\t" or char == "\n" or char == "\r": + return False + cat = unicodedata.category(char) + if cat.startswith("C"): + return True + return False + + +def _is_punctuation(char): + """Checks whether `chars` is a punctuation character.""" + cp = ord(char) + # We treat all non-letter/number ASCII as punctuation. + # Characters such as "^", "$", and "`" are not in the Unicode + # Punctuation class but we treat them as punctuation anyways, for + # consistency. + if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or + (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)): + return True + cat = unicodedata.category(char) + if cat.startswith("P"): + return True + return False + + +def is_chinese_char(cp): + """Checks whether CP is the codepoint of a CJK character.""" + # This defines a "chinese character" as anything in the CJK Unicode block: + # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) + # + # Note that the CJK Unicode block is NOT all Japanese and Korean characters, + # despite its name. The modern Korean Hangul alphabet is a different block, + # as is Japanese Hiragana and Katakana. Those alphabets are used to write + # space-separated words, so they are not treated specially and handled + # like the all of the other languages. + if ((cp >= 0x4E00 and cp <= 0x9FFF) or # + (cp >= 0x3400 and cp <= 0x4DBF) or # + (cp >= 0x20000 and cp <= 0x2A6DF) or # + (cp >= 0x2A700 and cp <= 0x2B73F) or # + (cp >= 0x2B740 and cp <= 0x2B81F) or # + (cp >= 0x2B820 and cp <= 0x2CEAF) or + (cp >= 0xF900 and cp <= 0xFAFF) or # + (cp >= 0x2F800 and cp <= 0x2FA1F)): # + return True + + return False + + +def tokenize_chinese_chars(text): + """Adds whitespace around any CJK character.""" + output = [] + buff = "" + for char in text: + cp = ord(char) + if is_chinese_char(cp): + if buff != "": + output.append(buff) + buff = "" + output.append(char) + else: + buff += char + + if buff != "": + output.append(buff) + + return output + + +class PretrainedTokenizer(object): + """ + The base class for all pretrained tokenizers. It mainly provides common methods + for loading (construction and loading) and saving pretrained tokenizers. Loading + and saving also rely on the following class attributes which should be overridden + by derived classes accordingly: + - **tokenizer_config_file** (str): Represents the file name of tokenizer + configuration for configuration saving and loading in local file system. + The value is `tokenizer_config.json`. + - **resource_files_names** (dict): Represents resources to specific file + names mapping for resource saving and loading in local file system. The + keys of dict representing resource items should be argument names in + tokenizer's `__init__` method, and the values are file names for saving + and loading corresponding resources. The mostly used resources here are + vocabulary file and sentence-piece model file. + - **pretrained_init_configuration** (dict): Provides the tokenizer configurations + of built-in pretrained tokenizers (contrasts to tokenizers in local file + system). It has pretrained tokenizer names as keys (the same as pretrained + model names, such as `bert-base-uncased`), and the values are dict preserving + corresponding configuration for tokenizer initialization. + - **pretrained_resource_files_map** (dict): Provides resource URLs of built-in + pretrained tokenizers (contrasts to tokenizers in local file system). 
It + has the same keys as `resource_files_names`, and the values are also `dict` + mapping specific pretrained tokenizer names (such as `bert-base-uncased`) + to corresponding resource URLs. + Moreover, methods common to tokenizers for tokenization, token/id conversion + and encoding as model inputs are also provided here. + Besides, metaclass `InitTrackerMeta` is used to create `PretrainedTokenizer`, + by which subclasses can track arguments for initialization automatically + and expose special tokens initialization used as attributes. + """ + tokenizer_config_file = "tokenizer_config.json" + pretrained_init_configuration = {} + resource_files_names = {} # keys are arguments of __init__ + pretrained_resource_files_map = {} + padding_side = 'right' + pad_token_type_id = 0 + + def __call__(self, + text, + text_pair=None, + max_seq_len: Optional[int]=None, + stride=0, + is_split_into_words=False, + pad_to_max_seq_len=False, + truncation_strategy="longest_first", + return_position_ids=False, + return_token_type_ids=True, + return_attention_mask=False, + return_length=False, + return_overflowing_tokens=False, + return_special_tokens_mask=False): + """ + Performs tokenization and uses the tokenized tokens to prepare model + inputs. It supports sequence or sequence pair as input, and batch input + is allowed. `self.encode()` or `self.batch_encode()` would be called + separately for single or batch input depending on input format and + `is_split_into_words` argument. + Args: + text (str, List[str] or List[List[str]]): + The sequence or batch of sequences to be processed. One sequence + is a string or a list of strings depending on whether it has been + pretokenized. If each sequence is provided as a list of strings + (pretokenized), you must set `is_split_into_words` as `True` to + disambiguate with a batch of sequences. + text_pair (str, List[str] or List[List[str]], optional): + Same as `text` argument, while it represents for the latter + sequence of the sequence pair. + max_seq_len (int, optional): + If set to a number, will limit the total sequence returned so + that it has a maximum length. If there are overflowing tokens, + those overflowing tokens will be added to the returned dictionary + when `return_overflowing_tokens` is `True`. Defaults to `None`. + stride (int, optional): + Only available for batch input of sequence pair and mainly for + question answering usage. When for QA, `text` represents questions + and `text_pair` represents contexts. If `stride` is set to a + positive number, the context will be split into multiple spans + where `stride` defines the number of (tokenized) tokens to skip + from the start of one span to get the next span, thus will produce + a bigger batch than inputs to include all spans. Moreover, 'overflow_to_sample' + and 'offset_mapping' preserving the original example and position + information will be added to the returned dictionary. Defaults to 0. + pad_to_max_seq_len (bool, optional): + If set to `True`, the returned sequences would be padded up to + `max_seq_len` specified length according to padding side + (`self.padding_side`) and padding token id. Defaults to `False`. + truncation_strategy (str, optional): + String selected in the following options: + - 'longest_first' (default) Iteratively reduce the inputs sequence + until the input is under `max_seq_len` starting from the longest + one at each token (when there is a pair of input sequences). + - 'only_first': Only truncate the first sequence. 
+ - 'only_second': Only truncate the second sequence. + - 'do_not_truncate': Do not truncate (raise an error if the input + sequence is longer than `max_seq_len`). + Defaults to 'longest_first'. + return_position_ids (bool, optional): + Whether to include tokens position ids in the returned dictionary. + Defaults to `False`. + return_token_type_ids (bool, optional): + Whether to include token type ids in the returned dictionary. + Defaults to `True`. + return_attention_mask (bool, optional): + Whether to include the attention mask in the returned dictionary. + Defaults to `False`. + return_length (bool, optional): + Whether to include the length of each encoded inputs in the + returned dictionary. Defaults to `False`. + return_overflowing_tokens (bool, optional): + Whether to include overflowing token information in the returned + dictionary. Defaults to `False`. + return_special_tokens_mask (bool, optional): + Whether to include special tokens mask information in the returned + dictionary. Defaults to `False`. + Returns: + dict or list[dict] (for batch input): + The dict has the following optional items: + - **input_ids** (list[int]): List of token ids to be fed to a model. + - **position_ids** (list[int], optional): List of token position ids to be + fed to a model. Included when `return_position_ids` is `True` + - **token_type_ids** (list[int], optional): List of token type ids to be + fed to a model. Included when `return_token_type_ids` is `True`. + - **attention_mask** (list[int], optional): List of integers valued 0 or 1, + where 0 specifies paddings and should not be attended to by the + model. Included when `return_attention_mask` is `True`. + - **seq_len** (int, optional): The input_ids length. Included when `return_length` + is `True`. + - **overflowing_tokens** (list[int], optional): List of overflowing tokens. + Included when if `max_seq_len` is specified and `return_overflowing_tokens` + is True. + - **num_truncated_tokens** (int, optional): The number of overflowing tokens. + Included when if `max_seq_len` is specified and `return_overflowing_tokens` + is True. + - **special_tokens_mask** (list[int], optional): List of integers valued 0 or 1, + with 0 specifying special added tokens and 1 specifying sequence tokens. + Included when `return_special_tokens_mask` is `True`. + - **offset_mapping** (list[int], optional): list of pair preserving the + index of start and end char in original input for each token. + For a special token, the index pair is `(0, 0)`. Included when + `stride` works. + - **overflow_to_sample** (int, optional): Index of example from which this + feature is generated. Included when `stride` works. 
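A rough usage sketch of the three input forms (assuming a tokenizer built as in the earlier `bert-base-uncased` examples; the returned ids depend on the vocabulary):

    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    single = tokenizer('He was a puppeteer')                    # dict with 'input_ids', 'token_type_ids'
    pair = tokenizer('Who was he?', 'He was a puppeteer')       # pair packed as [CLS] A [SEP] B [SEP]
    batch = tokenizer(['He was a puppeteer', 'He was a poet'])  # list of dicts, one per example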
+        """
+        # Input type checking for clearer error
+        assert isinstance(text, str) or (
+            isinstance(text, (list, tuple)) and (len(text) == 0 or (
+                isinstance(text[0], str) or
+                (isinstance(text[0], (list, tuple)) and
+                 (len(text[0]) == 0 or isinstance(text[0][0], str)))))
+        ), ("text input must be of type `str` (single example), `List[str]` (batch or single pretokenized example) "
+            "or `List[List[str]]` (batch of pretokenized examples).")
+
+        assert (text_pair is None or isinstance(text_pair, str) or (
+            isinstance(text_pair, (list, tuple)) and (len(text_pair) == 0 or (
+                isinstance(text_pair[0], str) or
+                (isinstance(text_pair[0], (list, tuple)) and
+                 (len(text_pair[0]) == 0 or isinstance(text_pair[0][0], str)))))
+        )), (
+            "text_pair input must be of type `str` (single example), `List[str]` (batch or single pretokenized example) "
+            "or `List[List[str]]` (batch of pretokenized examples).")
+
+        is_batched = bool(
+            (not is_split_into_words and isinstance(text, (list, tuple))) or
+            (is_split_into_words and isinstance(text, (list, tuple)) and
+             text and isinstance(text[0], (list, tuple))))
+
+        if is_batched:
+            batch_text_or_text_pairs = list(zip(
+                text, text_pair)) if text_pair is not None else text
+            return self.batch_encode(
+                batch_text_or_text_pairs=batch_text_or_text_pairs,
+                max_seq_len=max_seq_len,
+                stride=stride,
+                is_split_into_words=is_split_into_words,
+                pad_to_max_seq_len=pad_to_max_seq_len,
+                truncation_strategy=truncation_strategy,
+                return_position_ids=return_position_ids,
+                return_token_type_ids=return_token_type_ids,
+                return_attention_mask=return_attention_mask,
+                return_length=return_length,
+                return_overflowing_tokens=return_overflowing_tokens,
+                return_special_tokens_mask=return_special_tokens_mask)
+        else:
+            return self.encode(
+                text=text,
+                text_pair=text_pair,
+                max_seq_len=max_seq_len,
+                pad_to_max_seq_len=pad_to_max_seq_len,
+                truncation_strategy=truncation_strategy,
+                return_position_ids=return_position_ids,
+                return_token_type_ids=return_token_type_ids,
+                return_attention_mask=return_attention_mask,
+                return_length=return_length,
+                return_overflowing_tokens=return_overflowing_tokens,
+                return_special_tokens_mask=return_special_tokens_mask)
+
+    @property
+    def all_special_tokens(self):
+        """
+        list: All the special tokens (e.g. '[UNK]', '[CLS]', ...) corresponding to
+            special token arguments in `__init__` (arguments end with '_token').
+        """
+        all_toks = []
+        set_attr = self.special_tokens_map
+        for attr_value in set_attr.values():
+            all_toks = all_toks + (list(attr_value) if isinstance(attr_value, (
+                list, tuple)) else [attr_value])
+        all_toks = list(set(all_toks))
+        return all_toks
+
+    @property
+    def all_special_ids(self):
+        """
+        list: All the token ids corresponding to all the special tokens.
+        """
+        all_toks = self.all_special_tokens
+        all_ids = self.convert_tokens_to_ids(all_toks)
+        return all_ids
+
+    def convert_tokens_to_ids(self, tokens):
+        """
+        Converts a token or a sequence of tokens into ids using the `vocab`
+        attribute (a token-to-id map). Override it if needed.
+        Args:
+            tokens (str|list[str]): A token or a list of tokens to be converted.
+        Returns:
+            int|list[int]: The converted token id or list of token ids.
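For illustration, chaining this with `tokenize` on the sentence used earlier in this patch gives (ids taken from the `bert-base-uncased` example, without `[CLS]`/`[SEP]`):

    tokens = tokenizer.tokenize('He was a puppeteer')
    # tokens -> ['he', 'was', 'a', 'puppet', '##eer']
    ids = tokenizer.convert_tokens_to_ids(tokens)
    # ids    -> [2002, 2001, 1037, 13997, 11510]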
+        """
+        if isinstance(tokens, list):
+            token_ids = []
+            for token in tokens:
+                token_id = self.vocab.get(token, self.unk_token_id)
+                token_ids.append(token_id)
+            return token_ids
+        elif isinstance(tokens, str):
+            # A single token maps to a single id; lists are handled above.
+            return self.vocab.get(tokens, self.unk_token_id)
+
+    @classmethod
+    def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs):
+        """
+        Creates an instance of `PretrainedTokenizer`. Related resources are loaded
+        by specifying name of a built-in pretrained model, or a community-contributed
+        pretrained model, or a local file directory path.
+        Args:
+            pretrained_model_name_or_path (str): Name of pretrained model or dir path
+                to load from. The string can be:
+                - Name of built-in pretrained model
+                - Name of a community-contributed pretrained model.
+                - Local directory path which contains tokenizer related resources
+                  and tokenizer config file ("tokenizer_config.json").
+            *args (tuple): positional arguments for model `__init__`. If provided,
+                use these as positional argument values for tokenizer initialization.
+            **kwargs (dict): keyword arguments for model `__init__`. If provided,
+                use these to update pre-defined keyword argument values for tokenizer
+                initialization.
+        Returns:
+            PretrainedTokenizer: An instance of `PretrainedTokenizer`.
+        Example:
+            .. code-block::
+                from paddlenlp.transformers import BertTokenizer
+                # Name of built-in pretrained model
+                tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+                # Name of community-contributed pretrained model
+                tokenizer = BertTokenizer.from_pretrained('yingyibiao/bert-base-uncased-sst-2-finetuned')
+                # Load from local directory path
+                tokenizer = BertTokenizer.from_pretrained('./my_bert/')
+        """
+        pretrained_models = list(cls.pretrained_init_configuration.keys())
+        vocab_files = {}
+        init_configuration = {}
+        # From built-in pretrained models
+        if pretrained_model_name_or_path in pretrained_models:
+            for file_id, map_list in cls.pretrained_resource_files_map.items():
+                vocab_files[file_id] = map_list[pretrained_model_name_or_path]
+            init_configuration = copy.deepcopy(
+                cls.pretrained_init_configuration[
+                    pretrained_model_name_or_path])
+        # From local dir path
+        elif os.path.isdir(pretrained_model_name_or_path):
+            for file_id, file_name in cls.resource_files_names.items():
+                full_file_name = os.path.join(pretrained_model_name_or_path,
+                                              file_name)
+                vocab_files[file_id] = full_file_name
+            vocab_files["tokenizer_config_file"] = os.path.join(
+                pretrained_model_name_or_path, cls.tokenizer_config_file)
+
+        default_root = os.path.join(DATA_HOME, pretrained_model_name_or_path)
+        resolved_vocab_files = {}
+        for file_id, file_path in vocab_files.items():
+            if file_path is None or os.path.isfile(file_path):
+                resolved_vocab_files[file_id] = file_path
+                continue
+            path = os.path.join(default_root, file_path.split('/')[-1])
+            if os.path.exists(path):
+                print("Already cached %s" % path)
+                resolved_vocab_files[file_id] = path
+            else:
+                print("Downloading %s and saved to %s" %
+                      (file_path, default_root))
+                try:
+                    resolved_vocab_files[file_id] = get_path_from_url(
+                        file_path, default_root)
+                except RuntimeError as err:
+                    print(err)
+                    raise RuntimeError(
+                        f"Can't load tokenizer for '{pretrained_model_name_or_path}'.\n"
+                        f"Please make sure that '{pretrained_model_name_or_path}' is:\n"
+                        "- a correct model-identifier of built-in pretrained models,\n"
+                        "- or a correct model-identifier of community-contributed pretrained models,\n"
+                        "- or the correct path to a directory 
containing relevant tokenizer files.\n" + ) + + # Prepare tokenizer initialization kwargs + # Did we saved some inputs and kwargs to reload ? + tokenizer_config_file = resolved_vocab_files.pop( + "tokenizer_config_file", None) + if tokenizer_config_file is not None: + with io.open(tokenizer_config_file, encoding="utf-8") as f: + init_kwargs = json.load(f) + else: + init_kwargs = init_configuration + # position args are stored in kwargs, maybe better not include + init_args = init_kwargs.pop("init_args", ()) + init_kwargs.pop("init_class", None) + + # Update with newly provided args and kwargs + init_args = init_args if not args else args + init_kwargs.update(kwargs) + + # Merge resolved_vocab_files arguments in init_kwargs if not including. + # Maybe need more ways to load resources. + for args_name, file_path in resolved_vocab_files.items(): + # when `pretrained_model_name_or_path` is a pretrained model name, + # use pretrained_init_configuration as `init_kwargs` to init which + # does not include the vocab file in it, thus add vocab file into + # args. + if args_name not in init_kwargs: + init_kwargs[args_name] = file_path + # when `pretrained_model_name_or_path` is a pretrained model dir, + # use tokenizer_config_file.json as `init_kwargs` to init which + # does include a vocab file path in it. However, if the vocab file + # path included in json does not exist, such as was deleted, to make + # it still work, use the vocab file under this dir. + elif not os.path.isfile(init_kwargs[args_name]) and os.path.isfile( + file_path): + init_kwargs[args_name] = file_path + # TODO(guosheng): avoid reduplication of position args and key word args + tokenizer = cls(*init_args, **init_kwargs) + return tokenizer + + def save_pretrained(self, save_directory): + """ + Save tokenizer configuration and related resources to files under + `save_directory`. The tokenizer configuration would be saved into + `tokenizer_config_file` indicating file (thus `tokenizer_config.json`), + and resources would be saved into `resource_files_names` indicating files + by using `self.save_resources(save_directory)`. + + The `save_directory` can be used in `from_pretrained` as argument value + of `pretrained_model_name_or_path` to re-load the tokenizer. + Args: + save_directory (str): Directory to save files into. + Example: + .. code-block:: + from paddlenlp.transformers import BertTokenizer + tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') + tokenizer.save_pretrained('trained_model') + # reload from save_directory + tokenizer = BertTokenizer.from_pretrained('trained_model') + """ + assert not os.path.isfile( + save_directory + ), "Saving directory ({}) should be a directory, not a file".format( + save_directory) + os.makedirs(save_directory, exist_ok=True) + + tokenizer_config_file = os.path.join(save_directory, + self.tokenizer_config_file) + # init_config is set in metaclass created `__init__`, + tokenizer_config = self.init_config + with io.open(tokenizer_config_file, "w", encoding="utf-8") as f: + f.write(json.dumps(tokenizer_config, ensure_ascii=False)) + + self.save_resources(save_directory) + + def save_resources(self, save_directory): + """ + Save tokenizer related resources to `resource_files_names` indicating + files under `save_directory` by copying directly. Override it if necessary. + Args: + save_directory (str): Directory to save files into. 
+ """ + for name, file_name in self.resource_files_names.items(): + src_path = self.init_config[name] + dst_path = os.path.join(save_directory, file_name) + if os.path.abspath(src_path) != os.path.abspath(dst_path): + copyfile(src_path, dst_path) + + @staticmethod + def load_vocabulary(filepath, + unk_token=None, + pad_token=None, + bos_token=None, + eos_token=None, + **kwargs): + """ + Instantiate an instance of `Vocab` from a file reserving all tokens + by using `Vocab.from_dict`. The file contains a token per line, and the + line number would be the index of corresponding token. + Args: + filepath (str): path of file to construct vocabulary. + unk_token (str): special token for unknown token. If no need, it also + could be `None`. Defaults to `None`. + pad_token (str): special token for padding token. If no need, it also + could be `None`. Defaults to `None`. + bos_token (str): special token for bos token. If no need, it also + could be `None`. Defaults to `None`. + eos_token (str): special token for eos token. If no need, it also + could be `None`. Defaults to `None`. + **kwargs (dict): keyword arguments for `Vocab.from_dict`. + Returns: + Vocab: An instance of `Vocab`. + """ + token_to_idx = {} + with io.open(filepath, 'r', encoding='utf-8') as f: + for index, line in enumerate(f): + token = line.rstrip('\n') + token_to_idx[token] = int(index) + return token_to_idx + + def __getattr__(self, name): + if name.endswith('_token'): + return self.special_tokens_map[name] + elif name.endswith('_token_id'): + return self.vocab[self.special_tokens_map[name[:-3]]] + raise AttributeError("'{}' object has no attribute '{}'".format( + type(self).__name__, name)) + + def truncate_sequences(self, + ids, + pair_ids=None, + num_tokens_to_remove=0, + truncation_strategy='longest_first', + stride=0): + """ + Truncates a sequence pair in place to the maximum length. + Args: + ids: list of tokenized input ids. Can be obtained from a string by chaining the + `tokenize` and `convert_tokens_to_ids` methods. + pair_ids: Optional second list of input ids. Can be obtained from a string by chaining the + `tokenize` and `convert_tokens_to_ids` methods. + num_tokens_to_remove (:obj:`int`, `optional`, defaults to ``0``): + number of tokens to remove using the truncation strategy + truncation_strategy: string selected in the following options: + - 'longest_first' (default) Iteratively reduce the inputs sequence until the input is under max_seq_len + starting from the longest one at each token (when there is a pair of input sequences). + Overflowing tokens only contains overflow from the first sequence. + - 'only_first': Only truncate the first sequence. raise an error if the first sequence is shorter or equal to than num_tokens_to_remove. + - 'only_second': Only truncate the second sequence + - 'do_not_truncate': Does not truncate (raise an error if the input sequence is longer than max_seq_len) + stride (:obj:`int`, `optional`, defaults to ``0``): + If set to a number along with max_seq_len, the overflowing tokens returned will contain some tokens + from the main sequence returned. The value of this argument defines the number of additional tokens. 
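A small worked example of the default 'longest_first' strategy as implemented below (with the default `stride=0`, only the removed ids are reported as overflow):

    ids, pair_ids, overflow = tokenizer.truncate_sequences(
        [1, 2, 3, 4, 5], pair_ids=[6, 7], num_tokens_to_remove=2)
    # ids      -> [1, 2, 3]   (the longer sequence is trimmed from the right)
    # pair_ids -> [6, 7]
    # overflow -> [4, 5]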
+ """ + if num_tokens_to_remove <= 0: + return ids, pair_ids, [] + + if truncation_strategy == 'longest_first': + overflowing_tokens = [] + for _ in range(num_tokens_to_remove): + if pair_ids is None or len(ids) > len(pair_ids): + overflowing_tokens = [ids[-1]] + overflowing_tokens + ids = ids[:-1] + else: + pair_ids = pair_ids[:-1] + window_len = min(len(ids), stride) + if window_len > 0: + overflowing_tokens = ids[-window_len:] + overflowing_tokens + elif truncation_strategy == 'only_first': + assert len(ids) > num_tokens_to_remove + window_len = min(len(ids), stride + num_tokens_to_remove) + overflowing_tokens = ids[-window_len:] + ids = ids[:-num_tokens_to_remove] + elif truncation_strategy == 'only_second': + assert pair_ids is not None and len(pair_ids) > num_tokens_to_remove + window_len = min(len(pair_ids), stride + num_tokens_to_remove) + overflowing_tokens = pair_ids[-window_len:] + pair_ids = pair_ids[:-num_tokens_to_remove] + elif truncation_strategy == 'do_not_truncate': + raise ValueError( + "Input sequence are too long for max_length. Please select a truncation strategy." + ) + else: + raise ValueError( + "Truncation_strategy should be selected in ['longest_first', 'only_first', 'only_second', 'do_not_truncate']" + ) + return (ids, pair_ids, overflowing_tokens) + + def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. + Should be overridden in a subclass if the model has a special way of building those. + Args: + token_ids_0 (:obj:`List[int]`): + List of IDs to which the special tokens will be added. + token_ids_1 (:obj:`List[int]`, `optional`): + Optional second list of IDs for sequence pairs. + Returns: + List[int]: List of input_id with the appropriate special tokens. + """ + if token_ids_1 is None: + return token_ids_0 + + return token_ids_0 + token_ids_1 + + def build_offset_mapping_with_special_tokens(self, + offset_mapping_0, + offset_mapping_1=None): + """ + Build offset map from a pair of offset map by concatenating and adding offsets of special tokens. + Should be overridden in a subclass if the model has a special way of building those. + Args: + offset_mapping_0 (List[tuple]): + List of char offsets to which the special tokens will be added. + offset_mapping_1 (List[tuple], optional): + Optional second list of char offsets for offset mapping pairs. + Returns: + List[tuple]: List of char offsets with the appropriate offsets of special tokens. + """ + if offset_mapping_1 is None: + return offset_mapping_0 + + return offset_mapping_0 + offset_mapping_1 + + def get_special_tokens_mask(self, + token_ids_0, + token_ids_1=None, + already_has_special_tokens=False): + """ + Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding + special tokens using the tokenizer ``encode`` methods. + Args: + token_ids_0 (List[int]): List of ids of the first sequence. + token_ids_1 (List[int], optional): List of ids of the second sequence. + already_has_special_tokens (bool, optional): Whether or not the token list is already + formatted with special tokens for the model. Defaults to None. + Returns: + results (List[int]): The list of integers in the range [0, 1]: + 1 for a special token, 0 for a sequence token. 
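The base implementation below adds no special tokens, so the mask is all zeros; subclasses such as `BertTokenizer` override it to flag the `[CLS]`/`[SEP]` positions. For example:

    mask = tokenizer.get_special_tokens_mask([5, 6], token_ids_1=[7, 8, 9])
    # mask -> [0, 0, 0, 0, 0]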
+ """ + return [0] * ((len(token_ids_1) + if token_ids_1 else 0) + len(token_ids_0)) + + def create_token_type_ids_from_sequences(self, + token_ids_0, + token_ids_1=None): + """ + Create a mask from the two sequences passed to be used in a sequence-pair classification task. + Should be overridden in a subclass if the model has a special way of building those. + If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s). + Args: + token_ids_0 (List[int]): + List of IDs. + token_ids_1 (List[int], optional): + Optional second list of IDs for sequence pairs. + Returns: + List[int]: List of token_type_id according to the given sequence(s). + """ + if token_ids_1 is None: + return len(token_ids_0) * [0] + return [0] * len(token_ids_0) + [1] * len(token_ids_1) + + def num_special_tokens_to_add(self, pair): + """ + Returns the number of added tokens when encoding a sequence with special tokens. + Args: + pair (bool, optional): + Whether the number of added tokens should be computed in the case of a sequence pair or a single + sequence. Defaults to `False`. + Returns: + int: Number of special tokens added to sequences. + """ + token_ids_0 = [] + token_ids_1 = [] + return len( + self.build_inputs_with_special_tokens(token_ids_0, token_ids_1 + if pair else None)) + + def encode(self, + text, + text_pair=None, + max_seq_len=512, + pad_to_max_seq_len=False, + truncation_strategy="longest_first", + return_position_ids=False, + return_token_type_ids=True, + return_attention_mask=False, + return_length=False, + return_overflowing_tokens=False, + return_special_tokens_mask=False): + """ + Performs tokenization and uses the tokenized tokens to prepare model + inputs. It supports sequence or sequence pair as input, and batch input + is not allowed. + Args: + text (str, List[str] or List[int]): + The sequence to be processed. One sequence is a string, a list + of strings, or a list of integers depending on whether it has + been pretokenized and converted to ids. + text_pair (str, List[str] or List[List[str]]): + Same as `text` argument, while it represents for the latter + sequence of the sequence pair. + max_seq_len (int, optional): + If set to a number, will limit the total sequence returned so + that it has a maximum length. If there are overflowing tokens, + those overflowing tokens will be added to the returned dictionary + when `return_overflowing_tokens` is `True`. Defaults to `None`. + stride (int, optional): + Only available for batch input of sequence pair and mainly for + question answering usage. When for QA, `text` represents questions + and `text_pair` represents contexts. If `stride` is set to a + positive number, the context will be split into multiple spans + where `stride` defines the number of (tokenized) tokens to skip + from the start of one span to get the next span, thus will produce + a bigger batch than inputs to include all spans. Moreover, 'overflow_to_sample' + and 'offset_mapping' preserving the original example and position + information will be added to the returned dictionary. Defaults to 0. + pad_to_max_seq_len (bool, optional): + If set to `True`, the returned sequences would be padded up to + `max_seq_len` specified length according to padding side + (`self.padding_side`) and padding token id. Defaults to `False`. 
+ truncation_strategy (str, optional): + String selected in the following options: + - 'longest_first' (default) Iteratively reduce the inputs sequence + until the input is under `max_seq_len` starting from the longest + one at each token (when there is a pair of input sequences). + - 'only_first': Only truncate the first sequence. + - 'only_second': Only truncate the second sequence. + - 'do_not_truncate': Do not truncate (raise an error if the input + sequence is longer than `max_seq_len`). + Defaults to 'longest_first'. + return_position_ids (bool, optional): + Whether to include tokens position ids in the returned dictionary. + Defaults to `False`. + return_token_type_ids (bool, optional): + Whether to include token type ids in the returned dictionary. + Defaults to `True`. + return_attention_mask (bool, optional): + Whether to include the attention mask in the returned dictionary. + Defaults to `False`. + return_length (bool, optional): + Whether to include the length of each encoded inputs in the + returned dictionary. Defaults to `False`. + return_overflowing_tokens (bool, optional): + Whether to include overflowing token information in the returned + dictionary. Defaults to `False`. + return_special_tokens_mask (bool, optional): + Whether to include special tokens mask information in the returned + dictionary. Defaults to `False`. + Returns: + dict: + The dict has the following optional items: + - **input_ids** (list[int]): List of token ids to be fed to a model. + - **position_ids** (list[int], optional): List of token position ids to be + fed to a model. Included when `return_position_ids` is `True` + - **token_type_ids** (list[int], optional): List of token type ids to be + fed to a model. Included when `return_token_type_ids` is `True`. + - **attention_mask** (list[int], optional): List of integers valued 0 or 1, + where 0 specifies paddings and should not be attended to by the + model. Included when `return_attention_mask` is `True`. + - **seq_len** (int, optional): The input_ids length. Included when `return_length` + is `True`. + - **overflowing_tokens** (list[int], optional): List of overflowing tokens. + Included when if `max_seq_len` is specified and `return_overflowing_tokens` + is True. + - **num_truncated_tokens** (int, optional): The number of overflowing tokens. + Included when if `max_seq_len` is specified and `return_overflowing_tokens` + is True. + - **special_tokens_mask** (list[int], optional): List of integers valued 0 or 1, + with 0 specifying special added tokens and 1 specifying sequence tokens. + Included when `return_special_tokens_mask` is `True`. + """ + + def get_input_ids(text): + if isinstance(text, str): + tokens = self._tokenize(text) + return self.convert_tokens_to_ids(tokens) + elif isinstance(text, + (list, tuple)) and len(text) > 0 and isinstance( + text[0], str): + return self.convert_tokens_to_ids(text) + elif isinstance(text, + (list, tuple)) and len(text) > 0 and isinstance( + text[0], int): + return text + else: + raise ValueError( + "Input is not valid. Should be a string, a list/tuple of strings or a list/tuple of integers." 
+ ) + + ids = get_input_ids(text) + pair_ids = get_input_ids(text_pair) if text_pair is not None else None + + pair = bool(pair_ids is not None) + len_ids = len(ids) + len_pair_ids = len(pair_ids) if pair else 0 + + encoded_inputs = {} + + # Truncation: Handle max sequence length + total_len = len_ids + len_pair_ids + (self.num_special_tokens_to_add( + pair=pair)) + if max_seq_len and total_len > max_seq_len: + + ids, pair_ids, overflowing_tokens = self.truncate_sequences( + ids, + pair_ids=pair_ids, + num_tokens_to_remove=total_len - max_seq_len, + truncation_strategy=truncation_strategy, ) + if return_overflowing_tokens: + encoded_inputs["overflowing_tokens"] = overflowing_tokens + encoded_inputs["num_truncated_tokens"] = total_len - max_seq_len + + # Add special tokens + + sequence = self.build_inputs_with_special_tokens(ids, pair_ids) + token_type_ids = self.create_token_type_ids_from_sequences(ids, + pair_ids) + + # Build output dictionnary + encoded_inputs["input_ids"] = sequence + if return_token_type_ids: + encoded_inputs["token_type_ids"] = token_type_ids + if return_special_tokens_mask: + encoded_inputs[ + "special_tokens_mask"] = self.get_special_tokens_mask(ids, + pair_ids) + if return_length: + encoded_inputs["seq_len"] = len(encoded_inputs["input_ids"]) + + # Check lengths + assert max_seq_len is None or len(encoded_inputs[ + "input_ids"]) <= max_seq_len + + # Padding + needs_to_be_padded = pad_to_max_seq_len and \ + max_seq_len and len(encoded_inputs["input_ids"]) < max_seq_len + + if needs_to_be_padded: + difference = max_seq_len - len(encoded_inputs["input_ids"]) + if self.padding_side == 'right': + if return_attention_mask: + encoded_inputs["attention_mask"] = [1] * len(encoded_inputs[ + "input_ids"]) + [0] * difference + if return_token_type_ids: + encoded_inputs["token_type_ids"] = ( + encoded_inputs["token_type_ids"] + + [self.pad_token_type_id] * difference) + if return_special_tokens_mask: + encoded_inputs["special_tokens_mask"] = encoded_inputs[ + "special_tokens_mask"] + [1] * difference + encoded_inputs["input_ids"] = encoded_inputs[ + "input_ids"] + [self.pad_token_id] * difference + elif self.padding_side == 'left': + if return_attention_mask: + encoded_inputs["attention_mask"] = [0] * difference + [ + 1 + ] * len(encoded_inputs["input_ids"]) + if return_token_type_ids: + encoded_inputs["token_type_ids"] = ( + [self.pad_token_type_id] * difference + + encoded_inputs["token_type_ids"]) + if return_special_tokens_mask: + encoded_inputs["special_tokens_mask"] = [ + 1 + ] * difference + encoded_inputs["special_tokens_mask"] + encoded_inputs["input_ids"] = [ + self.pad_token_id + ] * difference + encoded_inputs["input_ids"] + else: + if return_attention_mask: + encoded_inputs["attention_mask"] = [1] * len(encoded_inputs[ + "input_ids"]) + + if return_position_ids: + encoded_inputs["position_ids"] = list( + range(len(encoded_inputs["input_ids"]))) + + return encoded_inputs + + def batch_encode(self, + batch_text_or_text_pairs, + max_seq_len=512, + pad_to_max_seq_len=False, + stride=0, + is_split_into_words=False, + truncation_strategy="longest_first", + return_position_ids=False, + return_token_type_ids=True, + return_attention_mask=False, + return_length=False, + return_overflowing_tokens=False, + return_special_tokens_mask=False): + """ + Performs tokenization and uses the tokenized tokens to prepare model + inputs. It supports batch inputs of sequence or sequence pair. 
+ Args: + batch_text_or_text_pairs (list): + The element of list can be sequence or sequence pair, and the + sequence is a string or a list of strings depending on whether + it has been pretokenized. If each sequence is provided as a list + of strings (pretokenized), you must set `is_split_into_words` as + `True` to disambiguate with a sequence pair. + max_seq_len (int, optional): + If set to a number, will limit the total sequence returned so + that it has a maximum length. If there are overflowing tokens, + those overflowing tokens will be added to the returned dictionary + when `return_overflowing_tokens` is `True`. Defaults to `None`. + stride (int, optional): + Only available for batch input of sequence pair and mainly for + question answering usage. When for QA, `text` represents questions + and `text_pair` represents contexts. If `stride` is set to a + positive number, the context will be split into multiple spans + where `stride` defines the number of (tokenized) tokens to skip + from the start of one span to get the next span, thus will produce + a bigger batch than inputs to include all spans. Moreover, 'overflow_to_sample' + and 'offset_mapping' preserving the original example and position + information will be added to the returned dictionary. Defaults to 0. + pad_to_max_seq_len (bool, optional): + If set to `True`, the returned sequences would be padded up to + `max_seq_len` specified length according to padding side + (`self.padding_side`) and padding token id. Defaults to `False`. + truncation_strategy (str, optional): + String selected in the following options: + - 'longest_first' (default) Iteratively reduce the inputs sequence + until the input is under `max_seq_len` starting from the longest + one at each token (when there is a pair of input sequences). + - 'only_first': Only truncate the first sequence. + - 'only_second': Only truncate the second sequence. + - 'do_not_truncate': Do not truncate (raise an error if the input + sequence is longer than `max_seq_len`). + Defaults to 'longest_first'. + return_position_ids (bool, optional): + Whether to include tokens position ids in the returned dictionary. + Defaults to `False`. + return_token_type_ids (bool, optional): + Whether to include token type ids in the returned dictionary. + Defaults to `True`. + return_attention_mask (bool, optional): + Whether to include the attention mask in the returned dictionary. + Defaults to `False`. + return_length (bool, optional): + Whether to include the length of each encoded inputs in the + returned dictionary. Defaults to `False`. + return_overflowing_tokens (bool, optional): + Whether to include overflowing token information in the returned + dictionary. Defaults to `False`. + return_special_tokens_mask (bool, optional): + Whether to include special tokens mask information in the returned + dictionary. Defaults to `False`. + Returns: + list[dict]: + The dict has the following optional items: + - **input_ids** (list[int]): List of token ids to be fed to a model. + - **position_ids** (list[int], optional): List of token position ids to be + fed to a model. Included when `return_position_ids` is `True` + - **token_type_ids** (list[int], optional): List of token type ids to be + fed to a model. Included when `return_token_type_ids` is `True`. + - **attention_mask** (list[int], optional): List of integers valued 0 or 1, + where 0 specifies paddings and should not be attended to by the + model. Included when `return_attention_mask` is `True`. 
+ - **seq_len** (int, optional): The input_ids length. Included when `return_length` + is `True`. + - **overflowing_tokens** (list[int], optional): List of overflowing tokens. + Included when if `max_seq_len` is specified and `return_overflowing_tokens` + is True. + - **num_truncated_tokens** (int, optional): The number of overflowing tokens. + Included when if `max_seq_len` is specified and `return_overflowing_tokens` + is True. + - **special_tokens_mask** (list[int], optional): List of integers valued 0 or 1, + with 0 specifying special added tokens and 1 specifying sequence tokens. + Included when `return_special_tokens_mask` is `True`. + - **offset_mapping** (list[int], optional): list of pair preserving the + index of start and end char in original input for each token. + For a sqecial token, the index pair is `(0, 0)`. Included when + `stride` works. + - **overflow_to_sample** (int, optional): Index of example from which this + feature is generated. Included when `stride` works. + """ + + def get_input_ids(text): + if isinstance(text, str): + tokens = self._tokenize(text) + return self.convert_tokens_to_ids(tokens) + elif isinstance(text, + (list, tuple)) and len(text) > 0 and isinstance( + text[0], str): + return self.convert_tokens_to_ids(text) + elif isinstance(text, + (list, tuple)) and len(text) > 0 and isinstance( + text[0], int): + return text + else: + raise ValueError( + "Input is not valid. Should be a string, a list/tuple of strings or a list/tuple of integers." + ) + + batch_encode_inputs = [] + for example_id, tokens_or_pair_tokens in enumerate( + batch_text_or_text_pairs): + if not isinstance(tokens_or_pair_tokens, (list, tuple)): + text, text_pair = tokens_or_pair_tokens, None + elif is_split_into_words and not isinstance( + tokens_or_pair_tokens[0], (list, tuple)): + text, text_pair = tokens_or_pair_tokens, None + else: + text, text_pair = tokens_or_pair_tokens + + first_ids = get_input_ids(text) + second_ids = get_input_ids( + text_pair) if text_pair is not None else None + + if stride > 0 and second_ids is not None: + + max_len_for_pair = max_seq_len - len( + first_ids) - self.num_special_tokens_to_add(pair=True) + + token_offset_mapping = self.get_offset_mapping(text) + token_pair_offset_mapping = self.get_offset_mapping(text_pair) + + offset = 0 + while offset < len(second_ids): + encoded_inputs = {} + length = len(second_ids) - offset + if length > max_len_for_pair: + length = max_len_for_pair + + ids = first_ids + pair_ids = second_ids[offset:offset + length] + + mapping = token_offset_mapping + pair_mapping = token_pair_offset_mapping[offset:offset + + length] + + offset_mapping = self.build_offset_mapping_with_special_tokens( + mapping, pair_mapping) + sequence = self.build_inputs_with_special_tokens(ids, + pair_ids) + token_type_ids = self.create_token_type_ids_from_sequences( + ids, pair_ids) + + # Build output dictionnary + encoded_inputs["input_ids"] = sequence + if return_token_type_ids: + encoded_inputs["token_type_ids"] = token_type_ids + if return_special_tokens_mask: + encoded_inputs[ + "special_tokens_mask"] = self.get_special_tokens_mask( + ids, pair_ids) + if return_length: + encoded_inputs["seq_len"] = len(encoded_inputs[ + "input_ids"]) + + # Check lengths + assert max_seq_len is None or len(encoded_inputs[ + "input_ids"]) <= max_seq_len + + # Padding + needs_to_be_padded = pad_to_max_seq_len and \ + max_seq_len and len(encoded_inputs["input_ids"]) < max_seq_len + + encoded_inputs['offset_mapping'] = offset_mapping + + if needs_to_be_padded: + 
difference = max_seq_len - len(encoded_inputs[ + "input_ids"]) + if self.padding_side == 'right': + if return_attention_mask: + encoded_inputs["attention_mask"] = [1] * len( + encoded_inputs[ + "input_ids"]) + [0] * difference + if return_token_type_ids: + # 0 for padding token mask + encoded_inputs["token_type_ids"] = ( + encoded_inputs["token_type_ids"] + + [self.pad_token_type_id] * difference) + if return_special_tokens_mask: + encoded_inputs[ + "special_tokens_mask"] = encoded_inputs[ + "special_tokens_mask"] + [1 + ] * difference + encoded_inputs["input_ids"] = encoded_inputs[ + "input_ids"] + [self.pad_token_id] * difference + encoded_inputs['offset_mapping'] = encoded_inputs[ + 'offset_mapping'] + [(0, 0)] * difference + elif self.padding_side == 'left': + if return_attention_mask: + encoded_inputs["attention_mask"] = [ + 0 + ] * difference + [1] * len(encoded_inputs[ + "input_ids"]) + if return_token_type_ids: + # 0 for padding token mask + encoded_inputs["token_type_ids"] = ( + [self.pad_token_type_id] * difference + + encoded_inputs["token_type_ids"]) + if return_special_tokens_mask: + encoded_inputs["special_tokens_mask"] = [ + 1 + ] * difference + encoded_inputs[ + "special_tokens_mask"] + encoded_inputs["input_ids"] = [ + self.pad_token_id + ] * difference + encoded_inputs["input_ids"] + encoded_inputs['offset_mapping'] = [ + (0, 0) + ] * difference + encoded_inputs['offset_mapping'] + else: + if return_attention_mask: + encoded_inputs["attention_mask"] = [1] * len( + encoded_inputs["input_ids"]) + + if return_position_ids: + encoded_inputs["position_ids"] = list( + range(len(encoded_inputs["input_ids"]))) + + encoded_inputs['overflow_to_sample'] = example_id + batch_encode_inputs.append(encoded_inputs) + if offset + length == len(second_ids): + break + offset += min(length, stride) + + else: + batch_encode_inputs.append( + self.encode( + first_ids, + second_ids, + max_seq_len=max_seq_len, + pad_to_max_seq_len=pad_to_max_seq_len, + truncation_strategy=truncation_strategy, + return_position_ids=return_position_ids, + return_token_type_ids=return_token_type_ids, + return_attention_mask=return_attention_mask, + return_length=return_length, + return_overflowing_tokens=return_overflowing_tokens, + return_special_tokens_mask=return_special_tokens_mask)) + + return batch_encode_inputs + + def get_offset_mapping(self, text): + """ + Returns the map of tokens and the start and end index of their start and end character. + Modified from https://github.com/bojone/bert4keras/blob/master/bert4keras/tokenizers.py#L372 + Args: + text (str): + Input text. + Returns: + list: The offset map of input text. 
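As an illustration with a `BertTokenizer` and the sentence used in the earlier examples (the offsets are character spans in the original, un-normalized text and follow from the implementation below):

    mapping = tokenizer.get_offset_mapping('He was a puppeteer')
    # wordpieces: ['he', 'was', 'a', 'puppet', '##eer']
    # mapping:    [(0, 2), (3, 6), (7, 8), (9, 15), (15, 18)]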
+ + """ + split_tokens = [] + for token in self.basic_tokenizer.tokenize(text): + for sub_token in self.wordpiece_tokenizer.tokenize(token): + split_tokens.append(sub_token + if sub_token != self.unk_token else token) + + normalized_text, char_mapping = '', [] + + for i, ch in enumerate(text): + if self.basic_tokenizer.do_lower_case: + ch = ch.lower() + ch = unicodedata.normalize('NFD', ch) + ch = ''.join([c for c in ch if unicodedata.category(c) != 'Mn']) + + ch = ''.join([ + c for c in ch + if not (ord(c) == 0 or ord(c) == 0xfffd or _is_control(c)) + ]) + normalized_text += ch + + char_mapping.extend([i] * len(ch)) + + text, token_mapping, offset = normalized_text, [], 0 + + for token in split_tokens: + if token[:2] == '##': + token = token[2:] + + start = text[offset:].index(token) + offset + end = start + len(token) + + token_mapping.append( + (char_mapping[start], char_mapping[end - 1] + 1)) + offset = end + + return token_mapping diff --git a/python/paddle/framework/io.py b/python/paddle/framework/io.py index 7fdce2af64676..8b72f05f363cb 100644 --- a/python/paddle/framework/io.py +++ b/python/paddle/framework/io.py @@ -43,7 +43,10 @@ def _build_saved_state_dict(state_dict): name_table = {} for key, value in state_dict.items(): if isinstance(value, (Variable, core.VarBase)): - save_dict[key] = value.numpy() + if value.type == core.VarDesc.VarType.VOCAB: + save_dict[key] = value.value().get_map_tensor() + else: + save_dict[key] = value.numpy() name_table[key] = value.name else: save_dict[key] = value @@ -938,8 +941,9 @@ def load(path, **configs): if "StructuredToParameterName@@" in load_result: for key in load_result["StructuredToParameterName@@"]: - load_result[key] = _ndarray_to_tensor( - load_result[key], config.return_numpy) + if isinstance(load_result[key], np.ndarray): + load_result[key] = _ndarray_to_tensor( + load_result[key], config.return_numpy) if not config.keep_name_table and "StructuredToParameterName@@" in load_result: del load_result["StructuredToParameterName@@"] From fc5db55a39efe1891c6d4baadf27e97536950334 Mon Sep 17 00:00:00 2001 From: Wilber Date: Wed, 20 Oct 2021 15:59:00 +0800 Subject: [PATCH 047/116] fix fc fuse proble (#36568) --- paddle/fluid/framework/ir/fc_fuse_pass.cc | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/framework/ir/fc_fuse_pass.cc b/paddle/fluid/framework/ir/fc_fuse_pass.cc index 4510aea925e78..bb78cdab67752 100644 --- a/paddle/fluid/framework/ir/fc_fuse_pass.cc +++ b/paddle/fluid/framework/ir/fc_fuse_pass.cc @@ -51,7 +51,12 @@ FCFusePass::FCFusePass() { .IsTensor() .End() .AddAttr("axis") - .IsNumGE(1) + .IsNumMatch([](int axis) -> bool { + if (axis == -1 || axis >= 1) { + return true; + } + return false; + }) .End(); AddOpCompat(OpCompat("relu")) From 6a572a194102a4c01a8b403bb25b86edd72476ff Mon Sep 17 00:00:00 2001 From: ronnywang <524019753@qq.com> Date: Wed, 20 Oct 2021 16:01:18 +0800 Subject: [PATCH 048/116] [NPU] Add kldiv_loss_op for npu (#36494) --- paddle/fluid/operators/kldiv_loss_op_npu.cc | 163 ++++++++++++++++++ .../unittests/npu/test_kldiv_loss_op_npu.py | 154 +++++++++++++++++ 2 files changed, 317 insertions(+) create mode 100644 paddle/fluid/operators/kldiv_loss_op_npu.cc create mode 100644 python/paddle/fluid/tests/unittests/npu/test_kldiv_loss_op_npu.py diff --git a/paddle/fluid/operators/kldiv_loss_op_npu.cc b/paddle/fluid/operators/kldiv_loss_op_npu.cc new file mode 100644 index 0000000000000..7d7cdd4c78671 --- /dev/null +++ b/paddle/fluid/operators/kldiv_loss_op_npu.cc @@ -0,0 +1,163 @@ 
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the Licnse. */ + +#include "paddle/fluid/operators/kldiv_loss_op.h" +#include "paddle/fluid/operators/npu_op_runner.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +class KLDivLossNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* input = ctx.Input("X"); + auto* target = ctx.Input("Target"); + auto* loss = ctx.Output("Loss"); + auto reduction = ctx.Attr("reduction"); + loss->mutable_data(ctx.GetPlace()); + + auto& dev_ctx = ctx.template device_context(); + auto stream = dev_ctx.stream(); + + if ("none" == reduction) { + // log(label) + auto ones_tensor = ctx.AllocateTmpTensor( + target->dims(), dev_ctx); + const auto& ones_runner = + NpuOpRunner("OnesLike", {*target}, {ones_tensor}, {}); + ones_runner.Run(stream); + + auto sub_tensor = ctx.AllocateTmpTensor( + target->dims(), dev_ctx); + const auto& sub_runner = + NpuOpRunner("Sub", {*target, ones_tensor}, {sub_tensor}, {}); + sub_runner.Run(stream); + + auto log_target = ctx.AllocateTmpTensor( + target->dims(), dev_ctx); + const auto& log_runner = + NpuOpRunner("Log1p", {sub_tensor}, {log_target}, {}); + log_runner.Run(stream); + + // log(label) - input + const auto& sub_runner2 = + NpuOpRunner("Sub", {log_target, *input}, {*loss}, {}); + sub_runner2.Run(stream); + + // label * (log(label) - input) + auto min_value = + ctx.AllocateTmpTensor({1}, dev_ctx); + auto max_value = + ctx.AllocateTmpTensor({1}, dev_ctx); + FillNpuTensorWithConstant(&min_value, static_cast(0)); + FillNpuTensorWithConstant(&max_value, std::numeric_limits::max()); + + auto cliped_target = ctx.AllocateTmpTensor( + target->dims(), dev_ctx); + const auto& clip_runner = NpuOpRunner( + "ClipByValue", {*target, min_value, max_value}, {cliped_target}, {}); + clip_runner.Run(stream); + + const auto& mul_runner = + NpuOpRunner("Mul", {*loss, cliped_target}, {*loss}, {}); + mul_runner.Run(stream); + } else if ("batchmean" == reduction || "sum" == reduction) { + const auto& runner = NpuOpRunner("KLDiv", {*input, *target}, {*loss}, + {{"reduction", reduction}}); + runner.Run(stream); + } else if ("mean" == reduction) { + const auto& runner = NpuOpRunner("KLDiv", {*input, *target}, {*loss}, + {{"reduction", std::string("sum")}}); + runner.Run(stream); + + const int numel = input->numel(); + const auto& muls_runner = + NpuOpRunner("Muls", {*loss}, {*loss}, + {{"value", static_cast(1.0 / numel)}}); + muls_runner.Run(stream); + } + } +}; + +template +class KLDivLossGradNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* target = ctx.Input("Target"); + auto* loss_grad = ctx.Input(framework::GradVarName("Loss")); + auto* input_grad = ctx.Output(framework::GradVarName("X")); + auto reduction = ctx.Attr("reduction"); + input_grad->mutable_data(ctx.GetPlace()); + + auto& dev_ctx = ctx.template 
device_context(); + auto stream = dev_ctx.stream(); + + Tensor loss_grad_transformed; + if ("none" == reduction) { + loss_grad_transformed.ShareDataWith(*loss_grad); + } else { + loss_grad_transformed.mutable_data(input_grad->dims(), ctx.GetPlace()); + + NpuOpRunner broadcast_runner; + broadcast_runner.SetType("BroadcastTo"); + broadcast_runner.AddInput(*loss_grad); + broadcast_runner.AddInput(framework::vectorize(input_grad->dims())); + broadcast_runner.AddOutput(loss_grad_transformed); + broadcast_runner.Run(stream); + } + auto min_value = + ctx.AllocateTmpTensor({1}, dev_ctx); + auto max_value = + ctx.AllocateTmpTensor({1}, dev_ctx); + FillNpuTensorWithConstant(&min_value, static_cast(0)); + FillNpuTensorWithConstant(&max_value, std::numeric_limits::max()); + + auto cliped_target = ctx.AllocateTmpTensor( + target->dims(), dev_ctx); + const auto& clip_runner = NpuOpRunner( + "ClipByValue", {*target, min_value, max_value}, {cliped_target}, {}); + clip_runner.Run(stream); + + const auto& mul_runner = NpuOpRunner( + "Mul", {cliped_target, loss_grad_transformed}, {*input_grad}, {}); + mul_runner.Run(stream); + + float k = -1.0f; + + if ("mean" == reduction) { + k = static_cast(-1.0 / input_grad->numel()); + } else if ("batchmean" == reduction) { + k = static_cast(-1.0 / input_grad->dims()[0]); + } + + const auto& muls_runner = + NpuOpRunner("Muls", {*input_grad}, {*input_grad}, {{"value", k}}); + muls_runner.Run(stream); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_NPU_KERNEL(kldiv_loss, ops::KLDivLossNPUKernel, + ops::KLDivLossNPUKernel); + +REGISTER_OP_NPU_KERNEL(kldiv_loss_grad, ops::KLDivLossGradNPUKernel, + ops::KLDivLossGradNPUKernel); diff --git a/python/paddle/fluid/tests/unittests/npu/test_kldiv_loss_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_kldiv_loss_op_npu.py new file mode 100644 index 0000000000000..7ed1775fa5e6d --- /dev/null +++ b/python/paddle/fluid/tests/unittests/npu/test_kldiv_loss_op_npu.py @@ -0,0 +1,154 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
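The test below compares against a NumPy reference `kldiv_loss` imported from `test_kldiv_loss_op.py`, which is not part of this patch. A sketch of the definition it is assumed to implement, mirroring the `label * (log(label) - input)` computation (with non-positive labels clipped) of the NPU kernel above:

    import numpy as np

    def kldiv_loss_reference(x, target, reduction='mean'):
        # x holds log-probabilities, target holds probabilities
        safe_target = np.clip(target, 0, None)   # non-positive labels contribute nothing
        log_target = np.log(np.where(safe_target > 0, safe_target, 1.0))
        loss = safe_target * (log_target - x)
        if reduction == 'batchmean':
            return loss.sum() / loss.shape[0] if loss.ndim > 0 else loss.sum()
        if reduction == 'mean':
            return loss.mean()
        if reduction == 'sum':
            return loss.sum()
        return loss  # 'none'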
+ +from __future__ import print_function, division + +import numpy as np +import unittest +import sys +sys.path.append("..") +from op_test import OpTest +import paddle +import paddle.fluid as fluid +from test_kldiv_loss_op import kldiv_loss + +paddle.enable_static() + + +class TestKLDivLossOp(OpTest): + def set_npu(self): + self.__class__.use_npu = True + self.place = paddle.NPUPlace(0) + + def init_dtype(self): + self.dtype = 'float32' + + def setUp(self): + self.set_npu() + self.init_dtype() + self.initTestCase() + self.op_type = 'kldiv_loss' + x = np.random.uniform(-10, 10, self.x_shape).astype(self.dtype) + target = np.random.uniform(-10, 10, self.x_shape).astype(self.dtype) + + self.attrs = {"reduction": self.reduction} + + self.inputs = { + 'X': x, + 'Target': target, + } + loss = kldiv_loss(x, target, self.reduction) + self.outputs = {'Loss': loss.astype(self.dtype)} + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad(self): + self.check_grad_with_place( + self.place, ['X'], + 'Loss', + no_grad_set=set(["Target"]), + max_relative_error=0.15) + + def initTestCase(self): + self.x_shape = (4, 5, 5) + self.reduction = 'batchmean' + + +class TestKLDivLossOp2(TestKLDivLossOp): + def initTestCase(self): + self.x_shape = (3, 2, 7, 7) + self.reduction = 'none' + + +class TestKLDivLossOp3(TestKLDivLossOp): + def initTestCase(self): + self.x_shape = (2, 3, 5, 7, 9) + self.reduction = 'mean' + + +class TestKLDivLossOp4(TestKLDivLossOp): + def initTestCase(self): + self.x_shape = (5, 20) + self.reduction = 'sum' + + +class TestKLDivLossOp_fp16(TestKLDivLossOp): + def init_dtype(self): + self.dtype = 'float16' + + def test_check_output(self): + self.check_output_with_place(self.place, atol=3e-1) + + def test_check_grad(self): + input_grad = -self.inputs['Target'] * ( + self.inputs['Target'] > 0) / self.inputs['Target'].shape[0] + self.check_grad_with_place( + self.place, ['X'], + 'Loss', + no_grad_set=set(["Target"]), + max_relative_error=0.2, + user_defined_grads=[input_grad]) + + +class TestKLDivLossDygraph(unittest.TestCase): + def run_kl_loss(self, reduction, shape=(5, 20)): + x = np.random.uniform(-10, 10, shape).astype('float32') + target = np.random.uniform(-10, 10, shape).astype('float32') + gt_loss = kldiv_loss(x, target, reduction) + + with paddle.fluid.dygraph.guard(paddle.NPUPlace(0)): + kldiv_criterion = paddle.nn.KLDivLoss(reduction) + pred_loss = kldiv_criterion( + paddle.to_tensor(x), paddle.to_tensor(target)) + self.assertTrue(np.allclose(pred_loss.numpy(), gt_loss)) + + def test_kl_loss_batchmean(self): + self.run_kl_loss('batchmean') + + def test_kl_loss_batchmean_shape(self): + self.run_kl_loss('batchmean', ()) + + def test_kl_loss_mean(self): + self.run_kl_loss('mean') + + def test_kl_loss_sum(self): + self.run_kl_loss('sum') + + def test_kl_loss_none(self): + self.run_kl_loss('none') + + def test_kl_loss_static_api(self): + input = paddle.fluid.data(name='input', shape=[5, 20]) + label = paddle.fluid.data(name='label', shape=[5, 20]) + + pred_loss = paddle.nn.functional.kl_div(input, label) + + +class TestKLDivLossTypePromotion(unittest.TestCase): + def test_kl_div_promotion(self): + with paddle.fluid.dygraph.guard(paddle.NPUPlace(0)): + x1 = paddle.rand([5, 20], dtype='float32') + target1 = paddle.rand([5, 20], dtype='float32') + + kldiv_criterion = paddle.nn.KLDivLoss() + pred_loss1 = kldiv_criterion(x1, target1) + + x2 = paddle.rand([5, 20], dtype='float32') + target2 = paddle.rand([5, 20], dtype='float32') + pred_loss2 = 
paddle.nn.functional.kl_div(x2, target2) + + +if __name__ == "__main__": + unittest.main() From 17b4dd70a95b9eeec52237c8aa1c6b122b5e93a2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=9D=8E=E5=AD=A3?= <2042519524@qq.com> Date: Wed, 20 Oct 2021 16:13:22 +0800 Subject: [PATCH 049/116] Fix global gather and global scatter operators (#36517) * fix global gather and global scatter operators --- .../collective/global_scatter_op.cu.cc | 8 ++++---- python/paddle/distributed/utils.py | 20 +++++++------------ 2 files changed, 11 insertions(+), 17 deletions(-) diff --git a/paddle/fluid/operators/collective/global_scatter_op.cu.cc b/paddle/fluid/operators/collective/global_scatter_op.cu.cc index 64765b549e5c1..bec984c6b57e1 100644 --- a/paddle/fluid/operators/collective/global_scatter_op.cu.cc +++ b/paddle/fluid/operators/collective/global_scatter_op.cu.cc @@ -47,8 +47,8 @@ class GlobalScatterOpCUDAKernel : public framework::OpKernel { if (platform::is_cpu_place(local_count->place())) { cpu_local_count_data = local_count->data(); } else { - framework::TensorCopy(*local_count, platform::CPUPlace(), - &cpu_local_count); + framework::TensorCopySync(*local_count, platform::CPUPlace(), + &cpu_local_count); cpu_local_count_data = cpu_local_count.data(); } auto global_count_len = 0; @@ -57,8 +57,8 @@ class GlobalScatterOpCUDAKernel : public framework::OpKernel { cpu_global_count_data = global_count->data(); global_count_len = global_count->numel(); } else { - framework::TensorCopy(*global_count, platform::CPUPlace(), - &cpu_global_count); + framework::TensorCopySync(*global_count, platform::CPUPlace(), + &cpu_global_count); cpu_global_count_data = cpu_global_count.data(); global_count_len = cpu_global_count.numel(); } diff --git a/python/paddle/distributed/utils.py b/python/paddle/distributed/utils.py index 63585e167e8e3..31d5748ce392e 100644 --- a/python/paddle/distributed/utils.py +++ b/python/paddle/distributed/utils.py @@ -65,14 +65,11 @@ def global_scatter(x, to global_count. Args: - x (Tensor): Tensor. Every element in the list must be a Tensor whose data type - should be float16, float32, float64, int32 or int64. + x (Tensor): Tensor. The tensor data type should be float16, float32, float64, int32 or int64. local_count (Tensor): Tensor which have n_expert * world_size elements that indicates - how many data needed to be sent. Every element in the list must be a Tensor whose - data type should be int64. + how many data needed to be sent. The tensor data type should be int64. global_count (Tensor): Tensor which have n_expert * world_size elements that indicates - how many data needed to be received. Every element in the list must be a Tensor whose - data type should be int64. + how many data needed to be received. The tensor data type should be int64. group (Group, optional): The group instance return by new_group or None for global default group. Default: None. use_calc_stream (bool, optional): Wether to use calculation stream (True) or communication stream. Default: True. @@ -161,19 +158,16 @@ def global_gather(x, to global_count. Args: - x (Tensor): Tensor. Every element in the list must be a Tensor whose data type - should be float16, float32, float64, int32 or int64. + x (Tensor): Tensor. Tensor whose data type should be float16, float32, float64, int32 or int64. local_count (Tensor): Tensor which have n_expert * world_size elements that indicates - how many data needed to be received. Every element in the list must be a Tensor whose - data type should be int64. + how many data needed to be received. 
Tensor data type should be int64. global_count (Tensor): Tensor which have n_expert * world_size elements that indicates - how many data needed to be sent. Every element in the list must be a Tensor whose - data type should be int64. + how many data needed to be sent. Tensor data type should be int64. group (Group, optional): The group instance return by new_group or None for global default group. Default: None. use_calc_stream (bool, optional): Wether to use calculation stream (True) or communication stream. Default: True. Returns: - None. + out (Tensor): The data received from all experts. Examples: .. code-block:: python From 6a3941e3cb9a1752df2374561a4defc7b908fa62 Mon Sep 17 00:00:00 2001 From: Haohongxiang <86215757+haohongxiang@users.noreply.github.com> Date: Wed, 20 Oct 2021 19:46:03 +0800 Subject: [PATCH 050/116] fix bugs of ClipGradByGlobalNorm in HybridParallel (#36555) * fix bugs of ClipGradByGlobalNorm * add unittests * add unittests --- .../hybrid_parallel_optimizer.py | 78 ++++++++++++++----- .../unittests/hybrid_parallel_mp_fp16.py | 59 ++++++++++++++ .../tests/unittests/hybrid_parallel_pp_amp.py | 4 + .../unittests/hybrid_parallel_pp_fp16.py | 4 + .../test_parallel_dygraph_tensor_parallel.py | 3 + 5 files changed, 128 insertions(+), 20 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/hybrid_parallel_mp_fp16.py diff --git a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_optimizer.py index 6cd875905864b..e7108b3f4f343 100755 --- a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_optimizer.py @@ -50,8 +50,11 @@ def __init__(self, clip, hcg): @imperative_base.no_grad def _dygraph_clip(self, params_grads): params_and_grads = [] - sum_square_list_dist = [] - sum_square_list_not_dist = [] + + sum_square_dist_fp16 = [] + sum_square_dist_fp32 = [] + sum_square_not_dist_fp16 = [] + sum_square_not_dist_fp32 = [] for p, g in params_grads: if g is None: @@ -71,20 +74,51 @@ def _dygraph_clip(self, params_grads): if not_shared_enable: if p.is_distributed: - sum_square_list_dist.append(sum_square) + if p.dtype == paddle.float16: + sum_square_dist_fp16.append(sum_square) + elif p.dtype == paddle.float32: + sum_square_dist_fp32.append(sum_square) else: - sum_square_list_not_dist.append(sum_square) - - global_norm_var_dist = layers.concat(sum_square_list_dist) if len( - sum_square_list_dist) != 0 else layers.concat( - [paddle.to_tensor([0.])]) - global_norm_var_dist = layers.reduce_sum(global_norm_var_dist) - - global_norm_var_not_dist = layers.concat( - sum_square_list_not_dist) if len( - sum_square_list_not_dist) != 0 else layers.concat( - [paddle.to_tensor([0.])]) - global_norm_var_not_dist = layers.reduce_sum(global_norm_var_not_dist) + if p.dtype == paddle.float16: + sum_square_not_dist_fp16.append(sum_square) + elif p.dtype == paddle.float32: + sum_square_not_dist_fp32.append(sum_square) + + # global norm of distributed FP16 params_and_grads + if len(sum_square_dist_fp16) == 0: + global_norm_dist_fp16 = paddle.to_tensor([0.], dtype=paddle.float32) + else: + global_norm_dist_fp16 = layers.concat(sum_square_dist_fp16) + global_norm_dist_fp16 = layers.reduce_sum(global_norm_dist_fp16) + global_norm_dist_fp16 = paddle.cast( + global_norm_dist_fp16, dtype=paddle.float32) + + # global norm of non-distributed 
FP16 params_and_grads + if len(sum_square_not_dist_fp16) == 0: + global_norm_not_dist_fp16 = paddle.to_tensor( + [0.], dtype=paddle.float32) + else: + global_norm_not_dist_fp16 = layers.concat(sum_square_not_dist_fp16) + global_norm_not_dist_fp16 = layers.reduce_sum( + global_norm_not_dist_fp16) + global_norm_not_dist_fp16 = paddle.cast( + global_norm_not_dist_fp16, dtype=paddle.float32) + + # global norm of distributed FP32 params_and_grads + global_norm_dist_fp32 = layers.concat(sum_square_dist_fp32) if len( + sum_square_dist_fp32) != 0 else paddle.to_tensor( + [0.], dtype=paddle.float32) + global_norm_dist_fp32 = layers.reduce_sum(global_norm_dist_fp32) + + # global norm of non-distributed FP32 params_and_grads + global_norm_not_dist_fp32 = layers.concat( + sum_square_not_dist_fp32) if len( + sum_square_not_dist_fp32) != 0 else paddle.to_tensor( + [0.], dtype=paddle.float32) + global_norm_not_dist_fp32 = layers.reduce_sum(global_norm_not_dist_fp32) + + global_norm_var_dist = global_norm_dist_fp16 + global_norm_dist_fp32 + global_norm_var_not_dist = global_norm_not_dist_fp16 + global_norm_not_dist_fp32 # add all reduce to get global norm of distributed params_and_grads if self._hcg.get_model_parallel_world_size() > 1: @@ -105,22 +139,26 @@ def _dygraph_clip(self, params_grads): global_norm_var_not_dist, group=self._hcg.get_sharding_parallel_group()) - global_norm_var = layers.sqrt(global_norm_var_dist + - global_norm_var_not_dist) + global_norm_var_fp32 = layers.sqrt(global_norm_var_dist + + global_norm_var_not_dist) max_global_norm = layers.fill_constant( - shape=[1], dtype=global_norm_var.dtype, value=self.clip_norm) + shape=[1], dtype=global_norm_var_fp32.dtype, value=self.clip_norm) clip_var = layers.elementwise_div( x=max_global_norm, y=layers.elementwise_max( - x=global_norm_var, y=max_global_norm)) + x=global_norm_var_fp32, y=max_global_norm)) + clip_var_fp16 = paddle.cast(clip_var, paddle.float16) for p, g in params_grads: if g is None: continue if getattr(p, 'need_clip', True) is False: params_and_grads.append((p, g)) continue - new_grad = layers.elementwise_mul(x=g, y=clip_var) + if p.dtype == paddle.float16: + new_grad = layers.elementwise_mul(x=g, y=clip_var_fp16) + else: + new_grad = layers.elementwise_mul(x=g, y=clip_var) params_and_grads.append((p, new_grad)) return params_and_grads diff --git a/python/paddle/fluid/tests/unittests/hybrid_parallel_mp_fp16.py b/python/paddle/fluid/tests/unittests/hybrid_parallel_mp_fp16.py new file mode 100644 index 0000000000000..3e5eedbec9aea --- /dev/null +++ b/python/paddle/fluid/tests/unittests/hybrid_parallel_mp_fp16.py @@ -0,0 +1,59 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
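The per-dtype norm bookkeeping added to _dygraph_clip above only changes where the squared norms are accumulated; the clipping rule itself is unchanged. A NumPy sketch of that rule (ignoring the distributed/non-distributed split and the all-reduces over the parallel groups) might look like:

import numpy as np

def clip_by_global_norm_ref(grads, clip_norm=1.0):
    # accumulate squared L2 norms in fp32, whatever the gradient dtype is
    global_norm_sq = sum(float(np.sum(g.astype(np.float32) ** 2)) for g in grads)
    global_norm = np.sqrt(global_norm_sq)
    # scale factor: clip_norm / max(global_norm, clip_norm), i.e. 1.0 when already inside the bound
    clip_coef = clip_norm / max(global_norm, clip_norm)
    # fp16 gradients are scaled with an fp16 copy of the coefficient, fp32 ones with the fp32 value
    return [g * np.asarray(clip_coef, dtype=g.dtype) for g in grads]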
+ +from __future__ import division +from __future__ import print_function + +import paddle +import numpy as np +from hybrid_parallel_mp_model import TestDistMPTraning +import paddle.distributed.fleet as fleet +import unittest + + +class TestMPFP16(TestDistMPTraning): + def build_optimizer(self, model): + grad_clip = paddle.nn.ClipGradByGlobalNorm(1.0) + scheduler = paddle.optimizer.lr.ExponentialDecay( + learning_rate=0.001, gamma=0.999, verbose=True) + optimizer = paddle.optimizer.SGD(scheduler, + grad_clip=grad_clip, + parameters=model.parameters()) + + model, optimizer = paddle.amp.decorate( + models=model, + optimizers=optimizer, + level='O2', + save_dtype='float32') + + return optimizer + + def train_batch(self, batch, model, optimizer, is_mp): + scaler = paddle.amp.GradScaler(init_loss_scaling=5160) + if is_mp: + scaler = fleet.distributed_scaler(scaler) + with paddle.amp.auto_cast(enable=True, level="O2"): + output = model(batch) + loss = output.mean() + + scaled = scaler.scale(loss) + scaled.backward() + scaler.step(optimizer) + scaler.update() + optimizer.clear_grad() + return scaled + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/hybrid_parallel_pp_amp.py b/python/paddle/fluid/tests/unittests/hybrid_parallel_pp_amp.py index 33a04a5e7e183..84d11670027fe 100644 --- a/python/paddle/fluid/tests/unittests/hybrid_parallel_pp_amp.py +++ b/python/paddle/fluid/tests/unittests/hybrid_parallel_pp_amp.py @@ -61,11 +61,14 @@ def test_pp_model(self): rank_id = dist.get_rank() set_random_seed(1024, dp_id, rank_id) + grad_clip = paddle.nn.ClipGradByGlobalNorm(1.0) + #construct model a model_a = AlexNet(10) scheduler_a = paddle.optimizer.lr.PiecewiseDecay( boundaries=[2], values=[0.001, 0.002], verbose=True) optimizer_a = paddle.optimizer.SGD(learning_rate=scheduler_a, + grad_clip=grad_clip, parameters=model_a.parameters()) scaler_a = paddle.amp.GradScaler(init_loss_scaling=2**5) @@ -80,6 +83,7 @@ def test_pp_model(self): scheduler_b = paddle.optimizer.lr.PiecewiseDecay( boundaries=[2], values=[0.001, 0.002], verbose=True) optimizer_b = paddle.optimizer.SGD(learning_rate=scheduler_b, + grad_clip=grad_clip, parameters=model_b.parameters()) model_b = fleet.distributed_model(model_b) optimizer_b = fleet.distributed_optimizer(optimizer_b) diff --git a/python/paddle/fluid/tests/unittests/hybrid_parallel_pp_fp16.py b/python/paddle/fluid/tests/unittests/hybrid_parallel_pp_fp16.py index 571459365addf..9042cdba97675 100644 --- a/python/paddle/fluid/tests/unittests/hybrid_parallel_pp_fp16.py +++ b/python/paddle/fluid/tests/unittests/hybrid_parallel_pp_fp16.py @@ -61,11 +61,14 @@ def test_pp_model(self): rank_id = dist.get_rank() set_random_seed(1024, dp_id, rank_id) + grad_clip = paddle.nn.ClipGradByGlobalNorm(1.0) + #construct model a model_a = AlexNet(10) scheduler_a = paddle.optimizer.lr.PiecewiseDecay( boundaries=[2], values=[0.001, 0.002], verbose=True) optimizer_a = paddle.optimizer.SGD(learning_rate=scheduler_a, + grad_clip=grad_clip, parameters=model_a.parameters()) scaler_a = paddle.amp.GradScaler(init_loss_scaling=2**5) @@ -75,6 +78,7 @@ def test_pp_model(self): scheduler_b = paddle.optimizer.lr.PiecewiseDecay( boundaries=[2], values=[0.001, 0.002], verbose=True) optimizer_b = paddle.optimizer.SGD(learning_rate=scheduler_b, + grad_clip=grad_clip, parameters=model_b.parameters()) param_len = len(model_a.parameters()) diff --git a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_tensor_parallel.py 
b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_tensor_parallel.py index 4b9d6764bbb3b..3705deb5ad856 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_tensor_parallel.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_tensor_parallel.py @@ -30,6 +30,9 @@ def test_hybrid_parallel_mp_model(self): def test_hybrid_parallel_mp_amp(self): self.run_mnist_2gpu('hybrid_parallel_mp_amp.py') + def test_hybrid_parallel_mp_fp16(self): + self.run_mnist_2gpu('hybrid_parallel_mp_fp16.py') + def test_hybrid_parallel_mp_clip_grad(self): self.run_mnist_2gpu('hybrid_parallel_mp_clip_grad.py') From ded3e705ef34e5660de17d8aeb7ded3818abb63b Mon Sep 17 00:00:00 2001 From: danleifeng <52735331+danleifeng@users.noreply.github.com> Date: Wed, 20 Oct 2021 20:21:19 +0800 Subject: [PATCH 051/116] [heterps]fix heterps pipeline training (#36512) * split into PreBuildTask and BuildPull; slove endpass bug;test=develop * change buildcpu into prebuild and buildcpu into build;test=develop --- .../fluid/framework/fleet/ps_gpu_wrapper.cc | 54 ++++++++++++------- paddle/fluid/framework/fleet/ps_gpu_wrapper.h | 19 +++---- 2 files changed, 45 insertions(+), 28 deletions(-) diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc index d3990c1f3dd76..4fb98e526d5fc 100644 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc @@ -40,7 +40,7 @@ namespace framework { std::shared_ptr PSGPUWrapper::s_instance_ = NULL; bool PSGPUWrapper::is_initialized_ = false; -void PSGPUWrapper::BuildTask(std::shared_ptr gpu_task) { +void PSGPUWrapper::PreBuildTask(std::shared_ptr gpu_task) { VLOG(3) << "PSGPUWrapper::BuildGPUPSTask begin"; platform::Timer timeline; timeline.Start(); @@ -49,17 +49,7 @@ void PSGPUWrapper::BuildTask(std::shared_ptr gpu_task) { auto& local_keys = gpu_task->feature_keys_; auto& local_ptr = gpu_task->value_ptr_; - auto& device_keys = gpu_task->device_keys_; - auto& device_vals = gpu_task->device_values_; - auto& device_mutex = gpu_task->mutex_; - std::vector threads; -#ifdef PADDLE_WITH_PSLIB - auto fleet_ptr = FleetWrapper::GetInstance(); -#endif -#ifdef PADDLE_WITH_PSCORE - auto fleet_ptr = paddle::distributed::Communicator::GetInstance(); -#endif // data should be in input channel thread_keys_.resize(thread_keys_thread_num_); @@ -181,6 +171,25 @@ void PSGPUWrapper::BuildTask(std::shared_ptr gpu_task) { VLOG(3) << "GpuPs shard: " << i << " key len: " << local_keys[i].size(); local_ptr[i].resize(local_keys[i].size()); } +} + +void PSGPUWrapper::BuildPull(std::shared_ptr gpu_task) { + platform::Timer timeline; + int device_num = heter_devices_.size(); + auto& local_keys = gpu_task->feature_keys_; + auto& local_ptr = gpu_task->value_ptr_; + + auto& device_keys = gpu_task->device_keys_; + auto& device_vals = gpu_task->device_values_; + auto& device_mutex = gpu_task->mutex_; + + std::vector threads(thread_keys_shard_num_); +#ifdef PADDLE_WITH_PSLIB + auto fleet_ptr = FleetWrapper::GetInstance(); +#endif +#ifdef PADDLE_WITH_PSCORE + auto fleet_ptr = paddle::distributed::Communicator::GetInstance(); +#endif #ifdef PADDLE_WITH_PSLIB // get day_id: day nums from 1970 @@ -482,29 +491,32 @@ void PSGPUWrapper::LoadIntoMemory(bool is_shuffle) { void PSGPUWrapper::start_build_thread() { running_ = true; VLOG(3) << "start build CPU&GPU ps thread."; - build_cpu_threads_ = std::thread([this] { build_cpu_thread(); }); - build_gpu_threads_ = std::thread([this] { build_gpu_thread(); }); + 
pre_build_threads_ = std::thread([this] { pre_build_thread(); }); + build_threads_ = std::thread([this] { build_thread(); }); } -void PSGPUWrapper::build_cpu_thread() { +void PSGPUWrapper::pre_build_thread() { + // prebuild: process load_data while (running_) { std::shared_ptr gpu_task = nullptr; if (!data_ready_channel_->Get(gpu_task)) { continue; } - VLOG(3) << "thread BuildTask start."; + VLOG(3) << "thread PreBuildTask start."; platform::Timer timer; timer.Start(); // build cpu ps data process - BuildTask(gpu_task); + PreBuildTask(gpu_task); timer.Pause(); - VLOG(1) << "thread BuildTask end, cost time: " << timer.ElapsedSec() << "s"; + VLOG(1) << "thread PreBuildTask end, cost time: " << timer.ElapsedSec() + << "s"; buildcpu_ready_channel_->Put(gpu_task); } VLOG(3) << "build cpu thread end"; } -void PSGPUWrapper::build_gpu_thread() { +void PSGPUWrapper::build_thread() { + // build: build_pull + build_gputask while (running_) { std::shared_ptr gpu_task = nullptr; if (!gpu_free_channel_->Get(gpu_task)) { @@ -516,12 +528,14 @@ void PSGPUWrapper::build_gpu_thread() { VLOG(3) << "thread BuildGPUTask start."; platform::Timer timer; timer.Start(); + BuildPull(gpu_task); + timer.Pause(); + timer.Start(); BuildGPUTask(gpu_task); timer.Pause(); VLOG(1) << "thread BuildGPUTask end, cost time: " << timer.ElapsedSec() << "s"; - gpu_task_pool_.Push(gpu_task); train_ready_channel_->Put(gpu_task); } VLOG(3) << "build gpu thread end"; @@ -557,6 +571,8 @@ void PSGPUWrapper::EndPass() { if (keysize_max != 0) { HeterPs_->end_pass(); } + + gpu_task_pool_.Push(current_task_); current_task_ = nullptr; gpu_free_channel_->Put(current_task_); timer.Pause(); diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.h b/paddle/fluid/framework/fleet/ps_gpu_wrapper.h index 6f785cad33e2d..c1f83d2fe9274 100644 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.h +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.h @@ -84,13 +84,14 @@ class PSGPUWrapper { const int batch_size); void BuildGPUTask(std::shared_ptr gpu_task); - void BuildTask(std::shared_ptr gpu_task); + void PreBuildTask(std::shared_ptr gpu_task); + void BuildPull(std::shared_ptr gpu_task); void LoadIntoMemory(bool is_shuffle); void BeginPass(); void EndPass(); void start_build_thread(); - void build_cpu_thread(); - void build_gpu_thread(); + void pre_build_thread(); + void build_thread(); void Finalize() { VLOG(3) << "PSGPUWrapper Begin Finalize."; @@ -102,10 +103,10 @@ class PSGPUWrapper { gpu_free_channel_->Close(); train_ready_channel_->Close(); running_ = false; - VLOG(3) << "begin stop build_cpu_threads_"; - build_cpu_threads_.join(); - VLOG(3) << "begin stop build_gpu_threads_"; - build_gpu_threads_.join(); + VLOG(3) << "begin stop pre_build_threads_"; + pre_build_threads_.join(); + VLOG(3) << "begin stop build_threads_"; + build_threads_.join(); s_instance_ = nullptr; VLOG(3) << "PSGPUWrapper Finalize Finished."; } @@ -310,8 +311,8 @@ class PSGPUWrapper { train_ready_channel_ = paddle::framework::MakeChannel>(); std::shared_ptr current_task_ = nullptr; - std::thread build_cpu_threads_; - std::thread build_gpu_threads_; + std::thread pre_build_threads_; + std::thread build_threads_; bool running_ = false; protected: From e82c3a5f6da3348845a65670d412d5607c7b9c14 Mon Sep 17 00:00:00 2001 From: Aurelius84 Date: Thu, 21 Oct 2021 10:10:49 +0800 Subject: [PATCH 052/116] Support No DataTransform From GetKernelTypeForVar (#36571) * Add kQueueSync.synchronize_run_ logic * Support No DataTransform From GetKernelTypeForVar --- 
.../fluid/framework/new_executor/interpretercore.cc | 2 ++ .../framework/new_executor/interpretercore_util.cc | 12 ++++++++++-- .../fluid/framework/new_executor/new_executor_defs.h | 3 +++ .../fluid/framework/new_executor/stream_analyzer.cc | 3 ++- 4 files changed, 17 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/framework/new_executor/interpretercore.cc b/paddle/fluid/framework/new_executor/interpretercore.cc index f6157367cd4e2..b26d213ddf774 100644 --- a/paddle/fluid/framework/new_executor/interpretercore.cc +++ b/paddle/fluid/framework/new_executor/interpretercore.cc @@ -118,6 +118,8 @@ void InterpreterCore::Convert() { temp_inst.input_index_ = vec_func_list_[i].input_index; temp_inst.output_index_ = vec_func_list_[i].output_index; temp_inst.type_ = vec_func_list_[i].type_; + temp_inst.no_data_transform_index_ = + vec_func_list_[i].no_data_transform_index; OpInOutInfo info; diff --git a/paddle/fluid/framework/new_executor/interpretercore_util.cc b/paddle/fluid/framework/new_executor/interpretercore_util.cc index 3438fc3bd4dcd..7bb0429c6228b 100644 --- a/paddle/fluid/framework/new_executor/interpretercore_util.cc +++ b/paddle/fluid/framework/new_executor/interpretercore_util.cc @@ -278,6 +278,7 @@ void build_op_func_list(const platform::Place& place, // step 3. Insert memcpy_op if needed VariableValueMap& ins_map_temp = runtime_context.inputs; + std::unordered_set no_data_transform_index; for (auto& var_name_item : ins_map_temp) { for (size_t i = 0; i < var_name_item.second.size(); ++i) { auto var = var_name_item.second[i]; @@ -289,8 +290,14 @@ void build_op_func_list(const platform::Place& place, static_cast(op_base) ->GetKernelTypeForVar(var_name_item.first, *tensor_in, expected_kernel_key); - if (!platform::is_same_place(kernel_type_for_var.place_, - expected_kernel_key.place_)) { + if (platform::is_same_place(kernel_type_for_var.place_, + expected_kernel_key.place_)) { + // record no need data transformer input var_id + auto& var_name = inputs_names[var_name_item.first][i]; + VLOG(3) << op->Type() << " found no data_transform var: " << var_name + << " with id: " << var_scope->name2id[var_name]; + no_data_transform_index.emplace(var_scope->name2id[var_name]); + } else { if (op_base->Type() == "fetch_v2") { op_base->SetAttr("deepcopy", false); } @@ -385,6 +392,7 @@ void build_op_func_list(const platform::Place& place, } } } + op_func_node.no_data_transform_index = std::move(no_data_transform_index); // step 4. 
Run op kernel op_list->push_back(op_base); VLOG(3) << op_base->Type() diff --git a/paddle/fluid/framework/new_executor/new_executor_defs.h b/paddle/fluid/framework/new_executor/new_executor_defs.h index 19b7b6d5dc299..e6cff353a659d 100644 --- a/paddle/fluid/framework/new_executor/new_executor_defs.h +++ b/paddle/fluid/framework/new_executor/new_executor_defs.h @@ -511,6 +511,8 @@ struct Instruction { std::map> input_index_; std::map> output_index_; + std::unordered_set no_data_transform_index_; + std::vector gc_check_var_list; NextInstruction next_instruction_; @@ -527,6 +529,7 @@ struct OpFuncNode { // int unsed; std::map> input_index; std::map> output_index; + std::unordered_set no_data_transform_index; OpKernelComputeFunc kernel_func_; platform::DeviceContext* dev_ctx_; // not owned diff --git a/paddle/fluid/framework/new_executor/stream_analyzer.cc b/paddle/fluid/framework/new_executor/stream_analyzer.cc index a9322d8fc88ed..ffc2da499e1f7 100644 --- a/paddle/fluid/framework/new_executor/stream_analyzer.cc +++ b/paddle/fluid/framework/new_executor/stream_analyzer.cc @@ -38,7 +38,8 @@ std::vector StreamAnalyzer::ParseEventVarIds( std::vector new_event_var_ids; for (auto& item : next_instr.input_index_) { for (auto var_id : item.second) { - if (unique_var_ids.count(var_id) > 0) { + if (unique_var_ids.count(var_id) > 0 && + next_instr.no_data_transform_index_.count(var_id) == 0) { new_event_var_ids.push_back(var_id); } } From 1d38a01347cc7017ba65d93a3283fd7eaa415e2a Mon Sep 17 00:00:00 2001 From: zhangbo9674 <82555433+zhangbo9674@users.noreply.github.com> Date: Thu, 21 Oct 2021 10:20:41 +0800 Subject: [PATCH 053/116] refine comments for GradScaler state_dict (#36522) --- python/paddle/amp/grad_scaler.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/python/paddle/amp/grad_scaler.py b/python/paddle/amp/grad_scaler.py index 83f57fc74e89a..ca08ce196a983 100644 --- a/python/paddle/amp/grad_scaler.py +++ b/python/paddle/amp/grad_scaler.py @@ -579,11 +579,15 @@ def state_dict(self): Reurns: A dict of scaler includes: - init_loss_scaling (float, optional): The initial loss scaling factor. - incr_ratio(float, optional): The multiplier to use when increasing the loss scaling. - decr_ratio(float, optional): The less-than-one-multiplier to use when decreasing the loss scaling. - incr_every_n_steps(int, optional): Increases loss scaling every n consecutive steps with finite gradients. - decr_every_n_nan_or_inf(int, optional): Decreases loss scaling every n accumulated steps with nan or inf gradients. + scale (tensor): The loss scaling factor. + incr_ratio(float): The multiplier to use when increasing the loss scaling. + decr_ratio(float): The less-than-one-multiplier to use when decreasing the loss scaling. + incr_every_n_steps(int): Increases loss scaling every n consecutive steps with finite gradients. + decr_every_n_nan_or_inf(int): Decreases loss scaling every n accumulated steps with nan or inf gradients. + incr_count(int): The number of recent consecutive unskipped steps. + decr_count(int): The number of recent consecutive skipped steps. + use_dynamic_loss_scaling(bool): Whether to use dynamic loss scaling. If False, fixed loss_scaling is used. If True, the loss scaling is updated dynamicly. Default is True. 
+ Examples: From f69857749a34755de641444aab324e483eff79a0 Mon Sep 17 00:00:00 2001 From: YipZLF <22539457+YipZLF@users.noreply.github.com> Date: Thu, 21 Oct 2021 10:41:56 +0800 Subject: [PATCH 054/116] Fixed unit test for auto parallel cost model (#36574) --- .../test_auto_parallel_cost_model.py | 53 +++++++++---------- 1 file changed, 25 insertions(+), 28 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_cost_model.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_cost_model.py index 58d033ad65831..000b1db61381e 100644 --- a/python/paddle/fluid/tests/unittests/test_auto_parallel_cost_model.py +++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_cost_model.py @@ -16,6 +16,7 @@ import unittest +import copy import paddle import paddle.nn as nn import paddle.static as static @@ -141,28 +142,24 @@ def get_dist_prog(train_program, startup_program, dist_context, rank_id): loss, train_program, startup_program = mlp_forward(train_program, startup_program) + dist_strategy = fleet.DistributedStrategy() + # auto completion complete_train_program = auto.complete_annotation(train_program, dist_context) + partitioner = Partitioner(dist_strategy, dist_context, rank_id) + # logical partition + auto_parallel_main_prog, auto_parallel_startup_prog = partitioner.transpile_forward( + complete_train_program, startup_program) + dist_params_grads = partitioner.apply_backward( + loss, complete_train_program, startup_program, auto_parallel_main_prog, + auto_parallel_startup_prog) + optimizer = paddle.fluid.optimizer.AdamOptimizer() + opt_ops = partitioner.apply_optimize(optimizer, dist_params_grads, + auto_parallel_main_prog, + auto_parallel_startup_prog) - dist_strategy = fleet.DistributedStrategy() - dist_main_prog = [] - dist_startup_prog = [] - for rank_id in range(NUM_RANKS): - partitioner = Partitioner(dist_strategy, dist_context, rank_id) - # logical partition - auto_parallel_main_prog, auto_parallel_startup_prog = partitioner.transpile_forward( - complete_train_program, startup_program) - dist_params_grads = partitioner.apply_backward( - loss, complete_train_program, startup_program, - auto_parallel_main_prog, auto_parallel_startup_prog) - optimizer = paddle.fluid.optimizer.AdamOptimizer() - opt_ops = partitioner.apply_optimize(optimizer, dist_params_grads, - auto_parallel_main_prog, - auto_parallel_startup_prog) - dist_main_prog.append(auto_parallel_main_prog) - dist_startup_prog.append(auto_parallel_startup_prog) - return dist_main_prog, dist_startup_prog + return auto_parallel_main_prog, auto_parallel_startup_prog def check_runtime_estimation(cost): @@ -210,20 +207,20 @@ def test_empty_program_cost_model(self): self.assertTrue(check_empty_program_memory(cost)) def test_auto_parallel_cost_model(self): - train_program = paddle.static.Program() - startup_program = paddle.static.Program() - dist_context = DistributedContext() standalone_cost_data = get_single_node_data() - distributed_program, dist_startup_prog = get_dist_prog( - train_program, startup_program, dist_context, 0) + dist_program = [] for rank_id in range(NUM_RANKS): - complete_backward_annotation(distributed_program[rank_id], - dist_context) - reshard(distributed_program[rank_id], dist_startup_prog[rank_id], - rank_id, dist_context) + train_program = paddle.static.Program() + startup_program = paddle.static.Program() + dist_context = DistributedContext() + distributed_program, dist_startup_prog = get_dist_prog( + train_program, startup_program, dist_context, rank_id) + 
reshard(distributed_program, dist_startup_prog, rank_id, + dist_context) + dist_program.append(distributed_program) cluster = None cost = estimate_cost( - distributed_program, + dist_program, cluster=cluster, pipeline_config=pp_cfg, standalone_cost_data=standalone_cost_data, From 72533986d9c0885720c3793b2e4ed5e02cca39cd Mon Sep 17 00:00:00 2001 From: liutiexing <74819124+liutiexing@users.noreply.github.com> Date: Thu, 21 Oct 2021 11:07:43 +0800 Subject: [PATCH 055/116] Fix flame graph (#36578) * add align for WorkQueue * add spinlock * merge develop * merge * Add EventsWaiter * Revert "Add EventsWaiter" This reverts commit e206173aa9be7401b83a53581627bfaf557c8fb2. * adjust multithread using, fix flame graph * update --- .../framework/new_executor/interpretercore.cc | 35 +++++++++++-------- .../framework/new_executor/interpretercore.h | 3 +- 2 files changed, 23 insertions(+), 15 deletions(-) diff --git a/paddle/fluid/framework/new_executor/interpretercore.cc b/paddle/fluid/framework/new_executor/interpretercore.cc index b26d213ddf774..7e16c3619d61c 100644 --- a/paddle/fluid/framework/new_executor/interpretercore.cc +++ b/paddle/fluid/framework/new_executor/interpretercore.cc @@ -376,7 +376,8 @@ void InterpreterCore::ExecuteInstructionList( vec_instr.size(), op_run_number_.load())); } -void InterpreterCore::RunNextInstruction(const Instruction& instr) { +void InterpreterCore::RunNextInstructions( + const Instruction& instr, std::queue* reserved_next_ops) { auto& next_instr = instr.next_instruction_; auto& atomic_deps = async_work_queue_.AtomicDeps(); auto IsReady = [&](size_t next_id) { @@ -395,12 +396,12 @@ void InterpreterCore::RunNextInstruction(const Instruction& instr) { // keep all async_ops running in current thread for (auto next_id : next_instr.direct_run_) { if (IsReady(next_id)) { - RunInstructionAsync(next_id); + reserved_next_ops->push(next_id); } } for (auto next_id : next_instr.event_wait_run_) { if (IsReady(next_id)) { - RunInstructionAsync(next_id); + reserved_next_ops->push(next_id); } } } else { @@ -428,25 +429,31 @@ void InterpreterCore::RunNextInstruction(const Instruction& instr) { [&, next_id] { RunInstructionAsync(next_id); }); } } - if (first_op != 0) RunInstructionAsync(first_op); + if (first_op != 0) reserved_next_ops->push(first_op); } } void InterpreterCore::RunInstructionAsync(size_t instr_id) { - auto& instr_node = vec_instruction_[instr_id]; - platform::RecordEvent instruction_event( - instr_node.kernel_func_.operator_base_->Type()); - event_manager_.WaitEvent(instr_node, place_); + std::queue ready_ops; + ready_ops.push(instr_id); + while (!ready_ops.empty()) { + instr_id = ready_ops.front(); + ready_ops.pop(); + auto& instr_node = vec_instruction_[instr_id]; + platform::RecordEvent instruction_event( + instr_node.kernel_func_.operator_base_->Type()); + event_manager_.WaitEvent(instr_node, place_); - RunInstruction(instr_node); + RunInstruction(instr_node); - event_manager_.RecordEvent(instr_node, place_); - op_run_number_.fetch_add(1, std::memory_order_relaxed); + event_manager_.RecordEvent(instr_node, place_); + op_run_number_.fetch_add(1, std::memory_order_relaxed); - // GC infomation - CheckGC(instr_id, instr_node.gc_check_var_list); + // GC infomation + CheckGC(instr_id, instr_node.gc_check_var_list); - RunNextInstruction(instr_node); + RunNextInstructions(instr_node, &ready_ops); + } } void InterpreterCore::CheckGC(size_t instr_id, diff --git a/paddle/fluid/framework/new_executor/interpretercore.h b/paddle/fluid/framework/new_executor/interpretercore.h 
index 47f23aff4f00e..d6c916b9ddc4c 100644 --- a/paddle/fluid/framework/new_executor/interpretercore.h +++ b/paddle/fluid/framework/new_executor/interpretercore.h @@ -68,7 +68,8 @@ class InterpreterCore { void CheckGC(size_t instr_id, const std::vector& gc_check_list); void RunInstructionAsync(size_t instr_id); - void RunNextInstruction(const Instruction& instr_id); + void RunNextInstructions(const Instruction& instr_id, + std::queue* reserved_next_ops); void AddFetch(const std::vector& fetch_names); void BuildSkipShareLoDInfo(); From d64f7b3bda82cba9b8cd77573fda6a0be1a83887 Mon Sep 17 00:00:00 2001 From: zhaocaibei123 <48509226+zhaocaibei123@users.noreply.github.com> Date: Thu, 21 Oct 2021 11:18:25 +0800 Subject: [PATCH 056/116] add ctr table depends (#36465) * add ctr table depends * code style * fix * fix * fix naming * rename * rename --- .../fluid/distributed/common/local_random.h | 65 +++++ paddle/fluid/distributed/ps.proto | 68 +++++ paddle/fluid/distributed/table/CMakeLists.txt | 6 +- .../distributed/table/depends/feature_value.h | 167 ++++++++++++ .../distributed/table/depends/sparse_utils.h | 5 +- .../distributed/table/sparse_sgd_rule.cc | 243 ++++++++++++++++++ .../fluid/distributed/table/sparse_sgd_rule.h | 134 ++++++++++ paddle/fluid/distributed/test/CMakeLists.txt | 6 + .../distributed/test/feature_value_test.cc | 55 ++++ .../distributed/test/sparse_sgd_rule_test.cc | 191 ++++++++++++++ 10 files changed, 937 insertions(+), 3 deletions(-) create mode 100644 paddle/fluid/distributed/common/local_random.h create mode 100644 paddle/fluid/distributed/table/depends/feature_value.h create mode 100644 paddle/fluid/distributed/table/sparse_sgd_rule.cc create mode 100644 paddle/fluid/distributed/table/sparse_sgd_rule.h create mode 100644 paddle/fluid/distributed/test/feature_value_test.cc create mode 100644 paddle/fluid/distributed/test/sparse_sgd_rule_test.cc diff --git a/paddle/fluid/distributed/common/local_random.h b/paddle/fluid/distributed/common/local_random.h new file mode 100644 index 0000000000000..96b8d2d21a560 --- /dev/null +++ b/paddle/fluid/distributed/common/local_random.h @@ -0,0 +1,65 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include +#include + +namespace paddle { +namespace distributed { + +// Get time in seconds. 
+inline double current_realtime() { + struct timespec tp; + clock_gettime(CLOCK_REALTIME, &tp); + return tp.tv_sec + tp.tv_nsec * 1e-9; +} + +inline std::default_random_engine& local_random_engine() { + struct engine_wrapper_t { + std::default_random_engine engine; + engine_wrapper_t() { + static std::atomic x(0); // NOLINT + std::seed_seq sseq = { + x++, x++, x++, (unsigned long)(current_realtime() * 1000)}; // NOLINT + engine.seed(sseq); + } + }; + thread_local engine_wrapper_t r; + return r.engine; +} + +template +std::uniform_real_distribution& local_uniform_real_distribution() { + thread_local std::uniform_real_distribution distr; + assert(distr.a() == 0.0 && distr.b() == 1.0); + return distr; +} + +template +T uniform_real() { + return local_uniform_real_distribution()(local_random_engine()); +} + +template +T uniform_real(T a, T b) { + if (a == b) { + return a; + } + return (T)(a + uniform_real() * (b - a)); +} +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/ps.proto b/paddle/fluid/distributed/ps.proto index 862ae4a504d9b..002be15b003eb 100644 --- a/paddle/fluid/distributed/ps.proto +++ b/paddle/fluid/distributed/ps.proto @@ -119,10 +119,41 @@ message TableParameter { message TableAccessorParameter { optional string accessor_class = 1; + // optional SparseSGDRuleParameter sparse_sgd_param = 2; optional uint32 fea_dim = 4 [ default = 11 ]; optional uint32 embedx_dim = 5 [ default = 8 ]; optional uint32 embedx_threshold = 6 [ default = 10 ]; + optional CtrAccessorParameter ctr_accessor_param = 7; repeated TableAccessorSaveParameter table_accessor_save_param = 8; + // optional SparseCommonSGDRuleParameter sparse_commonsgd_param = 9; + optional SparseCommonSGDRuleParameter embed_sgd_param = 10; + optional SparseCommonSGDRuleParameter embedx_sgd_param = 11; +} + +message CtrAccessorParameter { + optional float nonclk_coeff = 1 + [ default = 0.1 ]; // to calculate show_click_score + optional float click_coeff = 2 + [ default = 1 ]; // to calculate show_click_score + optional float base_threshold = 3 [ + default = 1.5 + ]; // show_click_score > base_threshold, this feature can be saved + optional float delta_threshold = 4 + [ default = + 0.25 ]; // delta_score > delta_threshold, this feature can be saved + optional float delta_keep_days = 5 + [ default = + 16 ]; // unseen_day < delta_keep_days, this feature can be saved + optional float show_click_decay_rate = 6 [ + default = 0.98 + ]; // show/click will update to show/click * show_click_decay_rate after a day + optional float delete_threshold = 7 + [ default = 0.8 ]; // threshold to shrink a feasign + optional float delete_after_unseen_days = 8 + [ default = 30 ]; // unseen_day > delete_after_unseen_days, this feature + // will be delete in shrink_model + optional int32 ssd_unseenday_threshold = 9 + [ default = 1 ]; // threshold to save ssd } message TensorAccessorParameter { @@ -150,3 +181,40 @@ message TableAccessorSaveParameter { optional string converter = 2; optional string deconverter = 3; } + +// message SparseSGDRuleParameter { +// optional double learning_rate = 1 [default = 0.05]; +// optional double initial_g2sum = 2 [default = 3.0]; +// optional double initial_range = 3 [default = 0.0001]; +// repeated float weight_bounds = 4; +//} + +message SparseCommonSGDRuleParameter { + optional string name = 1; + optional SparseNaiveSGDRuleParameter naive = 2; + optional SparseAdagradSGDRuleParameter adagrad = 3; + optional SparseAdamSGDParameter adam = 4; +} + +message SparseNaiveSGDRuleParameter 
{ // SparseNaiveSGDRule + optional double learning_rate = 1 [ default = 0.05 ]; + optional double initial_range = 2 [ default = 0.0001 ]; + repeated float weight_bounds = 3; +} + +message + SparseAdagradSGDRuleParameter { // SparseAdaGradSGDRule|StdAdaGradSGDRule + optional double learning_rate = 1 [ default = 0.05 ]; + optional double initial_g2sum = 2 [ default = 3.0 ]; + optional double initial_range = 3 [ default = 0.0001 ]; + repeated float weight_bounds = 4; +} + +message SparseAdamSGDParameter { // SparseAdamSGDRule + optional double learning_rate = 1 [ default = 0.001 ]; + optional double initial_range = 2 [ default = 0.0001 ]; + optional double beta1_decay_rate = 3 [ default = 0.9 ]; + optional double beta2_decay_rate = 4 [ default = 0.999 ]; + optional double ada_epsilon = 5 [ default = 1e-08 ]; + repeated float weight_bounds = 6; +} diff --git a/paddle/fluid/distributed/table/CMakeLists.txt b/paddle/fluid/distributed/table/CMakeLists.txt index c928ebe90ceb9..b4b87e652b7db 100644 --- a/paddle/fluid/distributed/table/CMakeLists.txt +++ b/paddle/fluid/distributed/table/CMakeLists.txt @@ -35,4 +35,8 @@ cc_library(tensor_accessor SRCS tensor_accessor.cc DEPS ${TABLE_DEPS} eigen3 ps_ cc_library(tensor_table SRCS tensor_table.cc DEPS eigen3 ps_framework_proto executor scope device_context tensor ${TABLE_DEPS}) set_source_files_properties(table.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) -cc_library(table SRCS table.cc DEPS common_table tensor_accessor tensor_table ps_framework_proto string_helper device_context gflags glog boost) +set_source_files_properties(sparse_sgd_rule.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) +cc_library(sparse_sgd_rule SRCS sparse_sgd_rule.cc DEPS ${TABLE_DEPS} ps_framework_proto) + + +cc_library(table SRCS table.cc DEPS common_table tensor_accessor tensor_table ps_framework_proto string_helper device_context gflags glog boost sparse_sgd_rule) diff --git a/paddle/fluid/distributed/table/depends/feature_value.h b/paddle/fluid/distributed/table/depends/feature_value.h new file mode 100644 index 0000000000000..ad037a86bce80 --- /dev/null +++ b/paddle/fluid/distributed/table/depends/feature_value.h @@ -0,0 +1,167 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
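To make the intent of the new SGD-rule messages in ps.proto concrete, the two simpler rules reduce to the following NumPy sketch (hypothetical helpers; parameter names follow the proto fields):

import numpy as np

def naive_sgd_update(w, grad, learning_rate=0.05, weight_bounds=(-10.0, 10.0)):
    # SparseNaiveSGDRuleParameter: plain SGD step, weights clipped to weight_bounds
    return np.clip(w - learning_rate * grad, *weight_bounds)

def adagrad_update(w, g2sum, grad, learning_rate=0.05, initial_g2sum=3.0,
                   weight_bounds=(-10.0, 10.0)):
    # SparseAdagradSGDRuleParameter: a single g2sum accumulator shared by the whole embedding
    w = w - learning_rate * grad * np.sqrt(initial_g2sum / (initial_g2sum + g2sum))
    g2sum = g2sum + float(np.mean(grad * grad))
    return np.clip(w, *weight_bounds), g2sum

The C++ rules added in sparse_sgd_rule.cc below implement these updates, plus a per-dimension AdaGrad variant and Adam.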
+ +#pragma once + +#include +#include +#include // NOLINT +#include +#include +#include // NOLINT +#include +#include +#include +#include +#include "gflags/gflags.h" + +#include "butil/object_pool.h" +#include "paddle/fluid/distributed/common/utils.h" +#include "paddle/fluid/distributed/table/depends/initializers.h" +#include "paddle/fluid/distributed/thirdparty/round_robin.h" +#include "paddle/fluid/framework/generator.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/rw_lock.h" +#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/framework/threadpool.h" +#include "paddle/fluid/framework/variable.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/place.h" +#include "paddle/fluid/platform/port.h" +#include "paddle/fluid/string/printf.h" +#include "paddle/fluid/string/string_helper.h" + +namespace paddle { +namespace distributed { + +static const int CTR_SPARSE_SHARD_BUCKET_NUM_BITS = 6; +static const size_t CTR_SPARSE_SHARD_BUCKET_NUM = + static_cast(1) << CTR_SPARSE_SHARD_BUCKET_NUM_BITS; + +class FixedFeatureValue { + public: + FixedFeatureValue() {} + ~FixedFeatureValue() {} + float *data() { return data_.data(); } + size_t size() { return data_.size(); } + void resize(size_t size) { data_.resize(size); } + void shrink_to_fit() { data_.shrink_to_fit(); } + + private: + std::vector data_; +}; + +class SparseTableShard { + public: + typedef typename robin_hood::unordered_map + map_type; + SparseTableShard() {} + ~SparseTableShard() {} + + FixedFeatureValue *Init(const uint64_t &id) { + size_t hash = hasher_(id); + size_t bucket = compute_bucket(hash); + auto &table = values_[bucket]; + + FixedFeatureValue *value = nullptr; + value = butil::get_object(); + table[id] = value; + return value; + } + + // dont judge if (has(id)) + float *Get(const uint64_t &id) { + size_t hash = hasher_(id); + size_t bucket = compute_bucket(hash); + auto &table = values_[bucket]; + + // auto &value = table.at(id); + // return value->data_.data(); + auto res = table.find(id); + FixedFeatureValue *value = res->second; + return value->data(); + } + + // for load, to reset count, unseen_days + FixedFeatureValue *GetValue(const uint64_t &id) { + size_t hash = hasher_(id); + size_t bucket = compute_bucket(hash); + + auto &table = values_[bucket]; + auto res = table.find(id); + return res->second; + } + + void erase(uint64_t feasign) { + size_t hash = hasher_(feasign); + size_t bucket = compute_bucket(hash); + auto &table = values_[bucket]; + + auto iter = table.find(feasign); + if (iter != table.end()) { + butil::return_object(iter->second); + iter = table.erase(iter); + } + } + + void clear() {} + + size_t compute_bucket(size_t hash) { + if (CTR_SPARSE_SHARD_BUCKET_NUM == 1) { + return 0; + } else { + return hash >> (sizeof(size_t) * 8 - CTR_SPARSE_SHARD_BUCKET_NUM_BITS); + } + } + + map_type::iterator end() { + return values_[CTR_SPARSE_SHARD_BUCKET_NUM - 1].end(); + } + + map_type::iterator Find(uint64_t id) { + size_t hash = hasher_(id); + size_t bucket = compute_bucket(hash); + auto &table = values_[bucket]; + + auto got = table.find(id); + if (got == table.end()) { + return end(); + } else { + return got; + } + } + + private: + bool Has(const uint64_t id) { + size_t hash = hasher_(id); + size_t bucket = compute_bucket(hash); + auto &table = values_[bucket]; + + auto got = table.find(id); + if (got == table.end()) { + return false; + 
} else { + return true; + } + } + + public: + map_type values_[CTR_SPARSE_SHARD_BUCKET_NUM]; + std::hash hasher_; +}; + +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/table/depends/sparse_utils.h b/paddle/fluid/distributed/table/depends/sparse_utils.h index c185dd17d792e..708f7786bf3b0 100644 --- a/paddle/fluid/distributed/table/depends/sparse_utils.h +++ b/paddle/fluid/distributed/table/depends/sparse_utils.h @@ -31,8 +31,9 @@ struct PullSparseValue { feasigns_(nullptr), frequencies_(nullptr) {} - explicit PullSparseValue(std::vector feasigns, - std::vector frequencies, int dim) { + explicit PullSparseValue(std::vector& feasigns, // NOLINT + std::vector& frequencies, // NOLINT + int dim) { numel_ = feasigns.size(); dim_ = dim; is_training_ = true; diff --git a/paddle/fluid/distributed/table/sparse_sgd_rule.cc b/paddle/fluid/distributed/table/sparse_sgd_rule.cc new file mode 100644 index 0000000000000..614656a5a85d3 --- /dev/null +++ b/paddle/fluid/distributed/table/sparse_sgd_rule.cc @@ -0,0 +1,243 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/distributed/table/sparse_sgd_rule.h" +#include +#include "glog/logging.h" + +DEFINE_bool(enable_show_scale_gradient, true, "enable show scale gradient"); + +namespace paddle { +namespace distributed { + +void SparseNaiveSGDRule::load_config(const SparseCommonSGDRuleParameter& param, + size_t emb_dim) { + _embedding_dim = emb_dim; + auto naive_param = param.naive(); + learning_rate_ = naive_param.learning_rate(); + _initial_range = naive_param.initial_range(); + if (naive_param.weight_bounds_size() == 0) { + _min_bound = -std::numeric_limits::max(); + _max_bound = std::numeric_limits::max(); + } else { + CHECK(naive_param.weight_bounds_size() >= 2) + << "invalid repeated size for weight_bounds:" + << naive_param.weight_bounds_size(); + _min_bound = naive_param.weight_bounds(0); + _max_bound = naive_param.weight_bounds(1); + } +} + +void SparseNaiveSGDRule::update_value_work(float* w, float* sgd, + const float* push_value, + float scale) { + for (size_t i = 0; i < _embedding_dim; ++i) { + w[i] -= learning_rate_ * push_value[i]; + bound_value(w[i]); + } +} + +void SparseNaiveSGDRule::init_value_work(float* value, float* sgd, + bool zero_init) { + if (zero_init) { + for (size_t i = 0; i < _embedding_dim; ++i) { + value[i] = 0; + } + } else { + for (size_t i = 0; i < _embedding_dim; ++i) { + value[i] = + (local_uniform_real_distribution()(local_random_engine()) * 2 - + 1) * + _initial_range; + bound_value(value[i]); + } + } +} +void SparseAdaGradSGDRule::load_config( + const SparseCommonSGDRuleParameter& param, size_t emb_dim) { + _embedding_dim = emb_dim; + auto adagrad_param = param.adagrad(); + learning_rate_ = adagrad_param.learning_rate(); + _initial_g2sum = adagrad_param.initial_g2sum(); + _initial_range = adagrad_param.initial_range(); + + if (adagrad_param.weight_bounds_size() == 0) { + _min_bound = 
-std::numeric_limits::max(); + _max_bound = std::numeric_limits::max(); + } else { + CHECK(adagrad_param.weight_bounds_size() >= 2) + << "invalid repeated size for weight_bounds:" + << adagrad_param.weight_bounds_size(); + _min_bound = adagrad_param.weight_bounds(0); + _max_bound = adagrad_param.weight_bounds(1); + } +} + +void SparseAdaGradSGDRule::update_value_work(float* w, float* sgd, + const float* grad, float scale) { + float& g2sum = sgd[g2sum_index()]; + double add_g2sum = 0; + + for (int i = 0; i < _embedding_dim; i++) { + double scaled_grad = grad[i] / scale; + w[i] -= learning_rate_ * scaled_grad * + sqrt(_initial_g2sum / (_initial_g2sum + g2sum)); + bound_value(w[i]); + add_g2sum += scaled_grad * scaled_grad; + } + + g2sum += add_g2sum / _embedding_dim; +} + +void SparseAdaGradSGDRule::init_value_work(float* value, float* sgd, + bool zero_init) { + for (int i = 0; i < _embedding_dim; ++i) { + if (zero_init) { + value[i] = 0.0; + bound_value(value[i]); + } else { + value[i] = + (local_uniform_real_distribution()(local_random_engine()) * + 2 - + 1) * + _initial_range; + bound_value(value[i]); + } + } + sgd[g2sum_index()] = 0; +} + +void StdAdaGradSGDRule::load_config(const SparseCommonSGDRuleParameter& param, + size_t emb_dim) { + _embedding_dim = emb_dim; + auto adagrad_param = param.adagrad(); + learning_rate_ = adagrad_param.learning_rate(); + _initial_g2sum = adagrad_param.initial_g2sum(); + _initial_range = adagrad_param.initial_range(); + + if (adagrad_param.weight_bounds_size() == 0) { + _min_bound = -std::numeric_limits::max(); + _max_bound = std::numeric_limits::max(); + } else { + CHECK(adagrad_param.weight_bounds_size() >= 2) + << "invalid repeated size for weight_bounds:" + << adagrad_param.weight_bounds_size(); + _min_bound = adagrad_param.weight_bounds(0); + _max_bound = adagrad_param.weight_bounds(1); + } +} + +void StdAdaGradSGDRule::update_value_work(float* w, float* sgd, + const float* grad, float scale) { + for (int i = 0; i < _embedding_dim; i++) { + float& g2sum = sgd[g2sum_index() + i]; + double scaled_grad = grad[i] / scale; + w[i] -= learning_rate_ * scaled_grad * + sqrt(_initial_g2sum / (_initial_g2sum + g2sum)); + bound_value(w[i]); + g2sum += scaled_grad * scaled_grad; + } +} + +void StdAdaGradSGDRule::init_value_work(float* value, float* sgd, + bool zero_init) { + for (int i = 0; i < _embedding_dim; ++i) { + if (zero_init) { + value[i] = 0.0; + bound_value(value[i]); + } else { + value[i] = + (local_uniform_real_distribution()(local_random_engine()) * + 2 - + 1) * + _initial_range; + bound_value(value[i]); + } + sgd[g2sum_index() + i] = 0; + } +} + +void SparseAdamSGDRule::load_config(const SparseCommonSGDRuleParameter& param, + size_t emb_dim) { + _embedding_dim = emb_dim; + auto adam_param = param.adam(); + learning_rate_ = adam_param.learning_rate(); + _initial_range = adam_param.initial_range(); + _beta1_decay_rate = adam_param.beta1_decay_rate(); + _beta2_decay_rate = adam_param.beta2_decay_rate(); + _ada_epsilon = adam_param.ada_epsilon(); + if (adam_param.weight_bounds_size() == 0) { + _min_bound = -std::numeric_limits::max(); + _max_bound = std::numeric_limits::max(); + } else { + CHECK(adam_param.weight_bounds_size() >= 2) + << "invalid repeated size for weight_bounds:" + << adam_param.weight_bounds_size(); + _min_bound = adam_param.weight_bounds(0); + _max_bound = adam_param.weight_bounds(1); + } +} + +void SparseAdamSGDRule::update_value_work(float* w, float* sgd, + const float* grad, float scale) { + float* gsum = sgd + gsum_index(); + 
float* g2sum = sgd + g2sum_index(); + float* beta1_pow = sgd + beta1_pow_index(); + float* beta2_pow = sgd + beta2_pow_index(); + const float* g = grad; + + float lr = learning_rate_; + float beta1_pow_ = *beta1_pow; + float beta2_pow_ = *beta2_pow; + + // lr not change in one update + lr *= sqrt(1 - beta2_pow_) / (1 - beta1_pow_); + for (int i = 0; i < _embedding_dim; i++) { + // Calculation + gsum[i] = _beta1_decay_rate * gsum[i] + (1 - _beta1_decay_rate) * g[i]; + g2sum[i] = + _beta2_decay_rate * g2sum[i] + (1 - _beta2_decay_rate) * g[i] * g[i]; + w[i] = w[i] - lr * (gsum[i] / (sqrt(g2sum[i]) + _ada_epsilon)); + bound_value(w[i]); + } + // update beta_pow_decay + (*beta1_pow) *= _beta1_decay_rate; + (*beta2_pow) *= _beta2_decay_rate; +} + +void SparseAdamSGDRule::init_value_work(float* value, float* sgd, + bool zero_init) { + for (int i = 0; i < _embedding_dim; ++i) { + if (zero_init) { + value[i] = 0.0; + bound_value(value[i]); + } else { + value[i] = + (local_uniform_real_distribution()(local_random_engine()) * + 2 - + 1) * + _initial_range; + bound_value(value[i]); + } + } + // init rule gsum and g2sum + for (int i = gsum_index(); i < beta1_pow_index(); i++) { + sgd[i] = 0.0; + } + // init beta1_pow and beta2_pow + *(sgd + beta1_pow_index()) = _beta1_decay_rate; + *(sgd + beta2_pow_index()) = _beta2_decay_rate; +} +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/table/sparse_sgd_rule.h b/paddle/fluid/distributed/table/sparse_sgd_rule.h new file mode 100644 index 0000000000000..ba2baa42f742a --- /dev/null +++ b/paddle/fluid/distributed/table/sparse_sgd_rule.h @@ -0,0 +1,134 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
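As a quick numeric check of the Adam update above (a standalone sketch, not part of the patch: it simply mirrors the formulas in update_value_work with the defaults exercised by the unit test further down, lr = 0.1, beta1 = 0.9, beta2 = 0.999, ada_epsilon = 1e-8), one step from w = 0 with gradient 1 lands at roughly -0.1, which is the first label value the test asserts:

#include <cmath>
#include <cstdio>

int main() {
  const float lr = 0.1f, beta1 = 0.9f, beta2 = 0.999f, eps = 1e-8f;
  float w = 0.f, g = 1.f;                      // weight and gradient
  float gsum = 0.f, g2sum = 0.f;               // zero-initialized moments
  float beta1_pow = beta1, beta2_pow = beta2;  // as set by init_value_work
  float lr_t = lr * std::sqrt(1 - beta2_pow) / (1 - beta1_pow);  // bias-corrected lr
  gsum = beta1 * gsum + (1 - beta1) * g;        // first moment  -> 0.1
  g2sum = beta2 * g2sum + (1 - beta2) * g * g;  // second moment -> 0.001
  w -= lr_t * (gsum / (std::sqrt(g2sum) + eps));
  std::printf("w after one step: %f\n", w);     // ~ -0.1
  return 0;
}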
+ +#pragma once +#include +#include +#include +#include "glog/logging.h" // for CHECK +#include "paddle/fluid/distributed/common/local_random.h" // for local_uniform_real_distribution +#include "paddle/fluid/distributed/common/registerer.h" +#include "paddle/fluid/distributed/ps.pb.h" + +namespace paddle { +namespace distributed { + +class SparseValueSGDRule { + public: + SparseValueSGDRule() {} + virtual ~SparseValueSGDRule() {} + virtual void load_config(const SparseCommonSGDRuleParameter& param, + size_t emb_dim) { + _embedding_dim = emb_dim; + _name = param.name(); + } + virtual void update_value_work(float* w, float* sgd, const float* push_value, + float scale) = 0; + virtual void init_value_work(float* value, float* sgd, bool zero_init) = 0; + virtual size_t dim() = 0; + const std::string& get_name() const { return _name; } + void init_value(float* value, float* sgd, bool zero_init = true) { + init_value_work(value, sgd, zero_init); + } + void update_value(float* w, float* sgd, const float* push_value, + float scale = 1) { + update_value_work(w, sgd, push_value, scale); + } + template + void bound_value(T& w) { // NOLINT + if (!(w >= _min_bound)) { + w = (T)_min_bound; + } else if (!(w <= _max_bound)) { + w = (T)_max_bound; + } + } + float& min_bound() { return _min_bound; } + float& max_bound() { return _max_bound; } + + protected: + float _min_bound; + float _max_bound; + float _initial_range; + size_t _embedding_dim; + + private: + std::string _name; +}; + +REGISTER_PSCORE_REGISTERER(SparseValueSGDRule); + +class SparseNaiveSGDRule : public SparseValueSGDRule { + public: + virtual void load_config(const SparseCommonSGDRuleParameter& param, + size_t emb_dim); + virtual void update_value_work(float* w, float* sgd, const float* push_value, + float scale); + virtual void init_value_work(float* value, float* sgd, bool zero_init); + virtual size_t dim() { return 0; } + + private: + float learning_rate_; +}; + +class SparseAdaGradSGDRule : public SparseValueSGDRule { + public: + virtual void load_config(const SparseCommonSGDRuleParameter& param, + size_t emb_dim); + virtual void update_value_work(float* w, float* sgd, const float* push_value, + float scale); + virtual void init_value_work(float* value, float* sgd, bool zero_init); + virtual size_t dim() { return 1; } + size_t g2sum_index() { return 0; } + + private: + float learning_rate_; + float _initial_g2sum; +}; + +class StdAdaGradSGDRule : public SparseValueSGDRule { + public: + virtual void load_config(const SparseCommonSGDRuleParameter& param, + size_t emb_dim); + virtual void update_value_work(float* w, float* sgd, const float* push_value, + float scale); + virtual void init_value_work(float* value, float* sgd, bool zero_init); + virtual size_t dim() { return _embedding_dim; } + size_t g2sum_index() { return 0; } + + private: + float learning_rate_; + float _initial_g2sum; +}; + +class SparseAdamSGDRule : public SparseValueSGDRule { + public: + virtual void load_config(const SparseCommonSGDRuleParameter& param, + size_t emb_dim); + virtual void update_value_work(float* w, float* sgd, const float* push_value, + float scale); + virtual void init_value_work(float* value, float* sgd, bool zero_init); + virtual size_t dim() { return _embedding_dim * 2 + 2; } + size_t gsum_index() { return 0; } + size_t g2sum_index() { return gsum_index() + _embedding_dim; } + size_t beta1_pow_index() { return g2sum_index() + _embedding_dim; } + size_t beta2_pow_index() { return beta1_pow_index() + 1; } + + protected: + float learning_rate_; + float 
_beta1_decay_rate; + float _beta2_decay_rate; + float _ada_epsilon; +}; +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/test/CMakeLists.txt b/paddle/fluid/distributed/test/CMakeLists.txt index af87e1b6cc61d..832797ec2fc0e 100644 --- a/paddle/fluid/distributed/test/CMakeLists.txt +++ b/paddle/fluid/distributed/test/CMakeLists.txt @@ -20,3 +20,9 @@ cc_test(brpc_utils_test SRCS brpc_utils_test.cc DEPS brpc_utils scope math_funct set_source_files_properties(graph_node_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) cc_test(graph_node_test SRCS graph_node_test.cc DEPS graph_py_service scope server client communicator ps_service boost table ps_framework_proto ${COMMON_DEPS}) + +set_source_files_properties(feature_value_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) +cc_test(feature_value_test SRCS feature_value_test.cc DEPS ${COMMON_DEPS} boost table) + +set_source_files_properties(sparse_sgd_rule_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) +cc_test(sparse_sgd_rule_test SRCS sparse_sgd_rule_test.cc DEPS ${COMMON_DEPS} boost table) diff --git a/paddle/fluid/distributed/test/feature_value_test.cc b/paddle/fluid/distributed/test/feature_value_test.cc new file mode 100644 index 0000000000000..9c9f0ffcac321 --- /dev/null +++ b/paddle/fluid/distributed/test/feature_value_test.cc @@ -0,0 +1,55 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include + +#include +#include +#include // NOLINT +#include + +#include "google/protobuf/text_format.h" +#include "gtest/gtest.h" +#include "paddle/fluid/distributed/table/depends/feature_value.h" + +namespace paddle { +namespace distributed { + +TEST(BENCHMARK, LargeScaleKV) { + std::shared_ptr shard = + std::make_shared(); + uint64_t key = 1; + auto itr = shard->Find(key); + ASSERT_TRUE(itr == shard->end()); + + std::vector vec = {0.0, 0.1, 0.2, 0.3}; + + auto* feature_value = shard->Init(key); + feature_value->resize(vec.size()); + memcpy(feature_value->data(), vec.data(), vec.size() * sizeof(float)); + + itr = shard->Find(key); + ASSERT_TRUE(itr != shard->end()); + + feature_value = itr->second; + float* value_data = feature_value->data(); + + ASSERT_FLOAT_EQ(value_data[0], 0.0); + ASSERT_FLOAT_EQ(value_data[1], 0.1); + ASSERT_FLOAT_EQ(value_data[2], 0.2); + ASSERT_FLOAT_EQ(value_data[3], 0.3); +} + +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/test/sparse_sgd_rule_test.cc b/paddle/fluid/distributed/test/sparse_sgd_rule_test.cc new file mode 100644 index 0000000000000..e86234f1bd9c7 --- /dev/null +++ b/paddle/fluid/distributed/test/sparse_sgd_rule_test.cc @@ -0,0 +1,191 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/distributed/table/sparse_sgd_rule.h" +#include +#include +#include "gtest/gtest.h" +#include "paddle/fluid/distributed/ps.pb.h" + +namespace paddle { +namespace distributed { + +TEST(sparse_value_naive_sgd_test, init_and_update) { + SparseNaiveSGDRule rule; + SparseCommonSGDRuleParameter param; + param.set_name("naive"); + auto* naive_param = param.mutable_naive(); + naive_param->set_learning_rate(0.1); + naive_param->set_initial_range(0.3); + naive_param->add_weight_bounds(-10.0); + naive_param->add_weight_bounds(10.0); + + rule.load_config(param, 10); + + // check init_value for zero + const int kItemSize = 10; + float w[kItemSize]; + float grad[kItemSize]; + rule.init_value(w, w + 9, true); + + for (auto i = 0u; i < kItemSize; ++i) { + ASSERT_FLOAT_EQ(w[i], 0); + } + + // check init_value for random + rule.init_value(w, w + 9, false); + for (auto i = 0u; i < kItemSize; ++i) { + ASSERT_TRUE(w[i] >= rule.min_bound() && w[i] <= rule.max_bound()); + } + + // check update_value for one field + for (auto i = 0u; i < kItemSize; ++i) { + w[i] = 0; + } + for (auto i = 0u; i < kItemSize; ++i) { + grad[i] = (i + 1) * 1.0; + } + float label[] = {-0.100000, -0.200000, -0.300000, -0.400000, -0.500000, + -0.600000, -0.700000, -0.800000, -0.900000, -1.000000}; + const float* ptr_grad = grad; + rule.update_value(w, w + 9, ptr_grad); + + for (auto i = 0u; i < kItemSize; ++i) { + VLOG(3) << w[i] << "\n"; + ASSERT_FLOAT_EQ(w[i], label[i]); + } +} + +TEST(downpour_sparse_adagrad_test, test_init_and_update) { + SparseAdaGradSGDRule rule; + SparseCommonSGDRuleParameter param; + param.set_name("adagrad"); + auto* adagrad_param = param.mutable_adagrad(); + adagrad_param->set_learning_rate(0.1); + adagrad_param->set_initial_g2sum(0.2); + adagrad_param->set_initial_range(0.3); + adagrad_param->add_weight_bounds(-10.0); + adagrad_param->add_weight_bounds(10.0); + + rule.load_config(param, 10); + + // check init_value for zero + const int kValueSize = 11; + int kEmbSize = 10; + float w[kValueSize]; + + rule.init_value(w, w + 10, true); + + for (auto i = 0u; i < kEmbSize; ++i) { + ASSERT_FLOAT_EQ(w[i], 0); + } + ASSERT_FLOAT_EQ(w[kEmbSize], 0); + + // check init_value for random + rule.init_value(w, w + 10, false); + for (auto i = 0u; i < kEmbSize; ++i) { + ASSERT_TRUE(w[i] >= rule.min_bound() && w[i] <= rule.max_bound()); + } + ASSERT_FLOAT_EQ(w[kEmbSize], 0); + + // check update_value for one field + for (auto i = 0u; i < kEmbSize; ++i) { + w[i] = 0; + } + w[kEmbSize] = 0; + float grad[kEmbSize]; + for (auto i = 0u; i < kEmbSize; ++i) { + grad[i] = (i + 1) * 1.0; + } + + const float* ptr_grad = grad; + rule.update_value(w, w + 10, ptr_grad); + float label[] = {-0.100000, -0.200000, -0.300000, -0.400000, + -0.500000, -0.600000, -0.700000, -0.800000, + -0.900000, -1.000000, 38.500000}; + for (auto i = 0u; i < kValueSize; ++i) { + ASSERT_FLOAT_EQ(w[i], label[i]); + } +} + +TEST(downpour_sparse_adam_test, test_init_and_update) { + const int embed_dim = 10; // dims of parameters + SparseCommonSGDRuleParameter param; + param.set_name("adam"); + auto* adam_param = param.mutable_adam(); + 
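+  // One Adam step from zero-initialized values with grad[i] = i + 1 under the
+  // hyper-parameters set below is what the hard-coded `label` array encodes.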
adam_param->set_learning_rate(0.1); + adam_param->set_initial_range(0.3); + adam_param->set_beta1_decay_rate(0.9); + adam_param->set_beta2_decay_rate(0.999); + adam_param->set_ada_epsilon(1e-08); + adam_param->add_weight_bounds(-10.0); + adam_param->add_weight_bounds(10.0); + + ASSERT_FLOAT_EQ(param.adam().learning_rate(), 0.1); + ASSERT_FLOAT_EQ(param.adam().initial_range(), 0.3); + ASSERT_FLOAT_EQ(param.adam().beta1_decay_rate(), 0.9); + ASSERT_FLOAT_EQ(param.adam().beta2_decay_rate(), 0.999); + ASSERT_FLOAT_EQ(param.adam().ada_epsilon(), 1e-08); + + SparseAdamSGDRule rule; + + rule.load_config(param, embed_dim); + + // check init_value for zero + const int rule_dim = + rule.dim(); // dims of gsum + g2sum + beta1_pow + beta2_pow in adam + const int value_dim = embed_dim + rule_dim; // total dims of w + rule + float* value = new float[value_dim]; + rule.init_value(value, value + embed_dim, true); + for (auto i = 0u; i < rule.beta1_pow_index(); ++i) { + ASSERT_FLOAT_EQ(value[i], 0); + } + ASSERT_FLOAT_EQ(*(value + embed_dim + rule.beta1_pow_index()), 0.9); + ASSERT_FLOAT_EQ(*(value + embed_dim + rule.beta2_pow_index()), 0.999); + + // check init_value for random + rule.init_value(value, value + embed_dim, false); + for (auto i = 0u; i < embed_dim; ++i) { + ASSERT_TRUE(value[i] >= rule.min_bound() && value[i] <= rule.max_bound()); + } + for (auto i = rule.gsum_index(); i < rule.beta1_pow_index(); ++i) { + ASSERT_FLOAT_EQ(value[i + embed_dim], 0); + } + ASSERT_FLOAT_EQ(*(value + embed_dim + rule.beta1_pow_index()), 0.9); + ASSERT_FLOAT_EQ(*(value + embed_dim + rule.beta2_pow_index()), 0.999); + + // check update_value + rule.init_value(value, value + embed_dim, true); + float* grad = new float[embed_dim]; + for (auto i = 0u; i < embed_dim; ++i) { + grad[i] = (i + 1) * 1.0; + } + + float label[] = {-0.0999999642, -0.099999994, -0.099999994, -0.099999994, + -0.099999994, -0.099999994, -0.099999994, -0.100000001, + -0.100000009, -0.100000001, 0.100000024, 0.200000048, + 0.300000072, 0.400000095, 0.500000119, 0.600000143, + 0.700000167, 0.800000191, 0.900000215, 1.00000024, + 0.000999987125, 0.0039999485, 0.00899988413, 0.015999794, + 0.0249996781, 0.0359995365, 0.0489993691, 0.063999176, + 0.0809989572, 0.0999987125, 0.809999943, 0.998001039}; + + rule.update_value(value, value + embed_dim, grad); + + for (auto i = 0u; i < value_dim; ++i) { // check update + ASSERT_FLOAT_EQ(value[i], label[i]) << "i is " << i; + } +} +} // namespace distributed +} // namespace paddle From 5eb640c6c3d9baa66e7a960f0d213420e2b792d4 Mon Sep 17 00:00:00 2001 From: seemingwang Date: Thu, 21 Oct 2021 11:19:01 +0800 Subject: [PATCH 057/116] Graph engine4 (#36587) --- .../distributed/service/graph_brpc_client.cc | 58 ++++- .../distributed/service/graph_brpc_client.h | 3 +- .../distributed/service/graph_brpc_server.cc | 204 +++++++++++++++++- .../distributed/service/graph_brpc_server.h | 9 + .../distributed/service/graph_py_service.cc | 1 + .../fluid/distributed/service/sendrecv.proto | 1 + paddle/fluid/distributed/service/server.h | 3 +- .../distributed/table/common_graph_table.cc | 18 +- .../distributed/table/common_graph_table.h | 5 +- .../fluid/distributed/test/graph_node_test.cc | 6 + 10 files changed, 292 insertions(+), 16 deletions(-) diff --git a/paddle/fluid/distributed/service/graph_brpc_client.cc b/paddle/fluid/distributed/service/graph_brpc_client.cc index 68d9c9669b697..9f65a66708def 100644 --- a/paddle/fluid/distributed/service/graph_brpc_client.cc +++ b/paddle/fluid/distributed/service/graph_brpc_client.cc 
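The client-side hunk below extends batch_sample_neighboors with an optional server_index so a whole request can be pinned to one server (the path used by the new server-to-server sampling). A minimal calling sketch, assuming an already-connected GraphBrpcClient named client:

std::vector<std::vector<std::pair<uint64_t, float>>> res;
// Default: ids are bucketed by shard and each owning server answers its part.
auto status = client.batch_sample_neighboors(/*table_id=*/0, {96, 37},
                                             /*sample_size=*/4, res);
status.wait();
// New: send the whole batch to server 0 only, regardless of which servers own the ids.
auto status_one = client.batch_sample_neighboors(0, {96, 37}, 4, res,
                                                 /*server_index=*/0);
status_one.wait();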
@@ -304,7 +304,63 @@ std::future GraphBrpcClient::remove_graph_node( // char* &buffer,int &actual_size std::future GraphBrpcClient::batch_sample_neighboors( uint32_t table_id, std::vector node_ids, int sample_size, - std::vector>> &res) { + std::vector>> &res, + int server_index) { + if (server_index != -1) { + res.resize(node_ids.size()); + DownpourBrpcClosure *closure = new DownpourBrpcClosure(1, [&](void *done) { + int ret = 0; + auto *closure = (DownpourBrpcClosure *)done; + if (closure->check_response(0, PS_GRAPH_SAMPLE_NODES_FROM_ONE_SERVER) != + 0) { + ret = -1; + } else { + auto &res_io_buffer = closure->cntl(0)->response_attachment(); + butil::IOBufBytesIterator io_buffer_itr(res_io_buffer); + size_t bytes_size = io_buffer_itr.bytes_left(); + std::unique_ptr buffer_wrapper(new char[bytes_size]); + char *buffer = buffer_wrapper.get(); + io_buffer_itr.copy_and_forward((void *)(buffer), bytes_size); + + size_t node_num = *(size_t *)buffer; + int *actual_sizes = (int *)(buffer + sizeof(size_t)); + char *node_buffer = buffer + sizeof(size_t) + sizeof(int) * node_num; + + int offset = 0; + for (size_t node_idx = 0; node_idx < node_num; ++node_idx) { + int actual_size = actual_sizes[node_idx]; + int start = 0; + while (start < actual_size) { + res[node_idx].push_back( + {*(uint64_t *)(node_buffer + offset + start), + *(float *)(node_buffer + offset + start + + GraphNode::id_size)}); + start += GraphNode::id_size + GraphNode::weight_size; + } + offset += actual_size; + } + } + closure->set_promise_value(ret); + }); + auto promise = std::make_shared>(); + closure->add_promise(promise); + std::future fut = promise->get_future(); + ; + closure->request(0)->set_cmd_id(PS_GRAPH_SAMPLE_NODES_FROM_ONE_SERVER); + closure->request(0)->set_table_id(table_id); + closure->request(0)->set_client_id(_client_id); + closure->request(0)->add_params((char *)node_ids.data(), + sizeof(uint64_t) * node_ids.size()); + closure->request(0)->add_params((char *)&sample_size, sizeof(int)); + ; + // PsService_Stub rpc_stub(get_cmd_channel(server_index)); + GraphPsService_Stub rpc_stub = + getServiceStub(get_cmd_channel(server_index)); + closure->cntl(0)->set_log_id(butil::gettimeofday_ms()); + rpc_stub.service(closure->cntl(0), closure->request(0), + closure->response(0), closure); + return fut; + } std::vector request2server; std::vector server2request(server_size, -1); res.clear(); diff --git a/paddle/fluid/distributed/service/graph_brpc_client.h b/paddle/fluid/distributed/service/graph_brpc_client.h index 8acb2047b8e97..1fbb3fa9b0550 100644 --- a/paddle/fluid/distributed/service/graph_brpc_client.h +++ b/paddle/fluid/distributed/service/graph_brpc_client.h @@ -64,7 +64,8 @@ class GraphBrpcClient : public BrpcPsClient { // given a batch of nodes, sample graph_neighboors for each of them virtual std::future batch_sample_neighboors( uint32_t table_id, std::vector node_ids, int sample_size, - std::vector>>& res); + std::vector>>& res, + int server_index = -1); virtual std::future pull_graph_list(uint32_t table_id, int server_index, int start, diff --git a/paddle/fluid/distributed/service/graph_brpc_server.cc b/paddle/fluid/distributed/service/graph_brpc_server.cc index 110d4406fc556..b404082f7c410 100644 --- a/paddle/fluid/distributed/service/graph_brpc_server.cc +++ b/paddle/fluid/distributed/service/graph_brpc_server.cc @@ -61,6 +61,10 @@ int32_t GraphBrpcServer::initialize() { return 0; } +brpc::Channel *GraphBrpcServer::get_cmd_channel(size_t server_index) { + return _pserver_channels[server_index].get(); +} + 
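The single-server reply parsed by the client closure above is a flat byte buffer: a size_t node count, one int per node giving that node's payload size in bytes, then the packed (id, weight) pairs. A standalone decoding sketch, assuming GraphNode::id_size == sizeof(uint64_t) and GraphNode::weight_size == sizeof(float), as the pointer arithmetic above does:

#include <cstddef>
#include <cstdint>
#include <cstring>
#include <utility>
#include <vector>

std::vector<std::vector<std::pair<uint64_t, float>>> DecodeSampleReply(
    const char* buffer) {
  size_t node_num = 0;
  std::memcpy(&node_num, buffer, sizeof(size_t));
  const int* actual_sizes = reinterpret_cast<const int*>(buffer + sizeof(size_t));
  const char* node_buffer = buffer + sizeof(size_t) + sizeof(int) * node_num;
  std::vector<std::vector<std::pair<uint64_t, float>>> res(node_num);
  int offset = 0;
  for (size_t i = 0; i < node_num; ++i) {
    // Each node's payload is a run of fixed-size (uint64_t id, float weight) records.
    for (int start = 0; start < actual_sizes[i];
         start += sizeof(uint64_t) + sizeof(float)) {
      uint64_t id;
      float weight;
      std::memcpy(&id, node_buffer + offset + start, sizeof(uint64_t));
      std::memcpy(&weight, node_buffer + offset + start + sizeof(uint64_t),
                  sizeof(float));
      res[i].emplace_back(id, weight);
    }
    offset += actual_sizes[i];
  }
  return res;
}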
uint64_t GraphBrpcServer::start(const std::string &ip, uint32_t port) { std::unique_lock lock(mutex_); @@ -80,6 +84,42 @@ uint64_t GraphBrpcServer::start(const std::string &ip, uint32_t port) { return 0; } +int32_t GraphBrpcServer::build_peer2peer_connection(int rank) { + this->rank = rank; + auto _env = environment(); + brpc::ChannelOptions options; + options.protocol = "baidu_std"; + options.timeout_ms = 500000; + options.connection_type = "pooled"; + options.connect_timeout_ms = 10000; + options.max_retry = 3; + + std::vector server_list = _env->get_ps_servers(); + _pserver_channels.resize(server_list.size()); + std::ostringstream os; + std::string server_ip_port; + for (size_t i = 0; i < server_list.size(); ++i) { + server_ip_port.assign(server_list[i].ip.c_str()); + server_ip_port.append(":"); + server_ip_port.append(std::to_string(server_list[i].port)); + _pserver_channels[i].reset(new brpc::Channel()); + if (_pserver_channels[i]->Init(server_ip_port.c_str(), "", &options) != 0) { + VLOG(0) << "GraphServer connect to Server:" << server_ip_port + << " Failed! Try again."; + std::string int_ip_port = + GetIntTypeEndpoint(server_list[i].ip, server_list[i].port); + if (_pserver_channels[i]->Init(int_ip_port.c_str(), "", &options) != 0) { + LOG(ERROR) << "GraphServer connect to Server:" << int_ip_port + << " Failed!"; + return -1; + } + } + os << server_ip_port << ","; + } + LOG(INFO) << "servers peer2peer connection success:" << os.str(); + return 0; +} + int32_t GraphBrpcService::clear_nodes(Table *table, const PsRequestMessage &request, PsResponseMessage &response, @@ -160,6 +200,9 @@ int32_t GraphBrpcService::initialize() { &GraphBrpcService::remove_graph_node; _service_handler_map[PS_GRAPH_SET_NODE_FEAT] = &GraphBrpcService::graph_set_node_feat; + _service_handler_map[PS_GRAPH_SAMPLE_NODES_FROM_ONE_SERVER] = + &GraphBrpcService::sample_neighboors_across_multi_servers; + // shard初始化,serveråÆåŠØåŽę‰åÆ从envčŽ·å–åˆ°server_listēš„shardäæ”ęÆ initialize_shard_info(); @@ -172,10 +215,10 @@ int32_t GraphBrpcService::initialize_shard_info() { if (_is_initialize_shard_info) { return 0; } - size_t shard_num = _server->environment()->get_ps_servers().size(); + server_size = _server->environment()->get_ps_servers().size(); auto &table_map = *(_server->table()); for (auto itr : table_map) { - itr.second->set_shard(_rank, shard_num); + itr.second->set_shard(_rank, server_size); } _is_initialize_shard_info = true; } @@ -209,7 +252,9 @@ void GraphBrpcService::service(google::protobuf::RpcController *cntl_base, int service_ret = (this->*handler_func)(table, *request, *response, cntl); if (service_ret != 0) { response->set_err_code(service_ret); - response->set_err_msg("server internal error"); + if (!response->has_err_msg()) { + response->set_err_msg("server internal error"); + } } } @@ -403,7 +448,156 @@ int32_t GraphBrpcService::graph_get_node_feat(Table *table, return 0; } - +int32_t GraphBrpcService::sample_neighboors_across_multi_servers( + Table *table, const PsRequestMessage &request, PsResponseMessage &response, + brpc::Controller *cntl) { + // sleep(5); + CHECK_TABLE_EXIST(table, request, response) + if (request.params_size() < 2) { + set_response_code( + response, -1, + "graph_random_sample request requires at least 2 arguments"); + return 0; + } + size_t node_num = request.params(0).size() / sizeof(uint64_t), + size_of_size_t = sizeof(size_t); + uint64_t *node_data = (uint64_t *)(request.params(0).c_str()); + int sample_size = *(uint64_t *)(request.params(1).c_str()); + // std::vector 
res = ((GraphTable + // *)table).filter_out_non_exist_nodes(node_data, sample_size); + std::vector request2server; + std::vector server2request(server_size, -1); + std::vector local_id; + std::vector local_query_idx; + size_t rank = get_rank(); + for (int query_idx = 0; query_idx < node_num; ++query_idx) { + int server_index = + ((GraphTable *)table)->get_server_index_by_id(node_data[query_idx]); + if (server2request[server_index] == -1) { + server2request[server_index] = request2server.size(); + request2server.push_back(server_index); + } + } + if (server2request[rank] != -1) { + auto pos = server2request[rank]; + std::swap(request2server[pos], + request2server[(int)request2server.size() - 1]); + server2request[request2server[pos]] = pos; + server2request[request2server[(int)request2server.size() - 1]] = + request2server.size() - 1; + } + size_t request_call_num = request2server.size(); + std::vector> local_buffers; + std::vector local_actual_sizes; + std::vector seq; + std::vector> node_id_buckets(request_call_num); + std::vector> query_idx_buckets(request_call_num); + for (int query_idx = 0; query_idx < node_num; ++query_idx) { + int server_index = + ((GraphTable *)table)->get_server_index_by_id(node_data[query_idx]); + int request_idx = server2request[server_index]; + node_id_buckets[request_idx].push_back(node_data[query_idx]); + query_idx_buckets[request_idx].push_back(query_idx); + seq.push_back(request_idx); + } + size_t remote_call_num = request_call_num; + if (request2server.size() != 0 && request2server.back() == rank) { + remote_call_num--; + local_buffers.resize(node_id_buckets.back().size()); + local_actual_sizes.resize(node_id_buckets.back().size()); + } + cntl->response_attachment().append(&node_num, sizeof(size_t)); + auto local_promise = std::make_shared>(); + std::future local_fut = local_promise->get_future(); + std::vector failed(server_size, false); + std::function func = [&, node_id_buckets, query_idx_buckets, + request_call_num](void *done) { + local_fut.get(); + std::vector actual_size; + auto *closure = (DownpourBrpcClosure *)done; + std::vector> res( + remote_call_num); + size_t fail_num = 0; + for (size_t request_idx = 0; request_idx < remote_call_num; ++request_idx) { + if (closure->check_response(request_idx, PS_GRAPH_SAMPLE_NEIGHBOORS) != + 0) { + ++fail_num; + failed[request2server[request_idx]] = true; + } else { + auto &res_io_buffer = closure->cntl(request_idx)->response_attachment(); + size_t node_size; + res[request_idx].reset(new butil::IOBufBytesIterator(res_io_buffer)); + size_t num; + res[request_idx]->copy_and_forward(&num, sizeof(size_t)); + } + } + int size; + int local_index = 0; + for (size_t i = 0; i < node_num; i++) { + if (fail_num > 0 && failed[seq[i]]) { + size = 0; + } else if (request2server[seq[i]] != rank) { + res[seq[i]]->copy_and_forward(&size, sizeof(int)); + } else { + size = local_actual_sizes[local_index++]; + } + actual_size.push_back(size); + } + cntl->response_attachment().append(actual_size.data(), + actual_size.size() * sizeof(int)); + + local_index = 0; + for (size_t i = 0; i < node_num; i++) { + if (fail_num > 0 && failed[seq[i]]) { + continue; + } else if (request2server[seq[i]] != rank) { + char temp[actual_size[i] + 1]; + res[seq[i]]->copy_and_forward(temp, actual_size[i]); + cntl->response_attachment().append(temp, actual_size[i]); + } else { + char *temp = local_buffers[local_index++].get(); + cntl->response_attachment().append(temp, actual_size[i]); + } + } + closure->set_promise_value(0); + }; + + 
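+  // Everything above only builds the routing tables: server2request maps a
+  // server id to its slot in request2server, the local rank's bucket (if any)
+  // is swapped to the back so it can be answered in-process while the remote
+  // RPCs are in flight, and seq[] records which bucket each query fell into so
+  // `func` can merge the per-server replies back in the caller's order.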
DownpourBrpcClosure *closure = new DownpourBrpcClosure(remote_call_num, func); + + auto promise = std::make_shared>(); + closure->add_promise(promise); + std::future fut = promise->get_future(); + + for (int request_idx = 0; request_idx < remote_call_num; ++request_idx) { + int server_index = request2server[request_idx]; + closure->request(request_idx)->set_cmd_id(PS_GRAPH_SAMPLE_NEIGHBOORS); + closure->request(request_idx)->set_table_id(request.table_id()); + closure->request(request_idx)->set_client_id(rank); + size_t node_num = node_id_buckets[request_idx].size(); + + closure->request(request_idx) + ->add_params((char *)node_id_buckets[request_idx].data(), + sizeof(uint64_t) * node_num); + closure->request(request_idx) + ->add_params((char *)&sample_size, sizeof(int)); + PsService_Stub rpc_stub( + ((GraphBrpcServer *)get_server())->get_cmd_channel(server_index)); + // GraphPsService_Stub rpc_stub = + // getServiceStub(get_cmd_channel(server_index)); + closure->cntl(request_idx)->set_log_id(butil::gettimeofday_ms()); + rpc_stub.service(closure->cntl(request_idx), closure->request(request_idx), + closure->response(request_idx), closure); + } + if (server2request[rank] != -1) { + ((GraphTable *)table) + ->random_sample_neighboors(node_id_buckets.back().data(), sample_size, + local_buffers, local_actual_sizes); + } + local_promise.get()->set_value(0); + if (remote_call_num == 0) func(closure); + fut.get(); + return 0; +} int32_t GraphBrpcService::graph_set_node_feat(Table *table, const PsRequestMessage &request, PsResponseMessage &response, @@ -412,7 +606,7 @@ int32_t GraphBrpcService::graph_set_node_feat(Table *table, if (request.params_size() < 3) { set_response_code( response, -1, - "graph_set_node_feat request requires at least 2 arguments"); + "graph_set_node_feat request requires at least 3 arguments"); return 0; } size_t node_num = request.params(0).size() / sizeof(uint64_t); diff --git a/paddle/fluid/distributed/service/graph_brpc_server.h b/paddle/fluid/distributed/service/graph_brpc_server.h index 6b4853fa67992..817fe08331165 100644 --- a/paddle/fluid/distributed/service/graph_brpc_server.h +++ b/paddle/fluid/distributed/service/graph_brpc_server.h @@ -32,6 +32,8 @@ class GraphBrpcServer : public PSServer { virtual ~GraphBrpcServer() {} PsBaseService *get_service() { return _service.get(); } virtual uint64_t start(const std::string &ip, uint32_t port); + virtual int32_t build_peer2peer_connection(int rank); + virtual brpc::Channel *get_cmd_channel(size_t server_index); virtual int32_t stop() { std::unique_lock lock(mutex_); if (stoped_) return 0; @@ -50,6 +52,7 @@ class GraphBrpcServer : public PSServer { mutable std::mutex mutex_; std::condition_variable cv_; bool stoped_ = false; + int rank; brpc::Server _server; std::shared_ptr _service; std::vector> _pserver_channels; @@ -113,12 +116,18 @@ class GraphBrpcService : public PsBaseService { int32_t print_table_stat(Table *table, const PsRequestMessage &request, PsResponseMessage &response, brpc::Controller *cntl); + int32_t sample_neighboors_across_multi_servers( + Table *table, const PsRequestMessage &request, + PsResponseMessage &response, brpc::Controller *cntl); + private: bool _is_initialize_shard_info; std::mutex _initialize_shard_mutex; std::unordered_map _msg_handler_map; std::vector _ori_values; const int sample_nodes_ranges = 23; + size_t server_size; + std::shared_ptr<::ThreadPool> task_pool; }; } // namespace distributed diff --git a/paddle/fluid/distributed/service/graph_py_service.cc 
b/paddle/fluid/distributed/service/graph_py_service.cc index b415962701317..498805136417f 100644 --- a/paddle/fluid/distributed/service/graph_py_service.cc +++ b/paddle/fluid/distributed/service/graph_py_service.cc @@ -107,6 +107,7 @@ void GraphPyServer::start_server(bool block) { empty_vec.push_back(empty_prog); pserver_ptr->configure(server_proto, _ps_env, rank, empty_vec); pserver_ptr->start(ip, port); + pserver_ptr->build_peer2peer_connection(rank); std::condition_variable* cv_ = pserver_ptr->export_cv(); if (block) { std::mutex mutex_; diff --git a/paddle/fluid/distributed/service/sendrecv.proto b/paddle/fluid/distributed/service/sendrecv.proto index 696c950d9b33b..42e25258ec3fe 100644 --- a/paddle/fluid/distributed/service/sendrecv.proto +++ b/paddle/fluid/distributed/service/sendrecv.proto @@ -56,6 +56,7 @@ enum PsCmdID { PS_GRAPH_ADD_GRAPH_NODE = 35; PS_GRAPH_REMOVE_GRAPH_NODE = 36; PS_GRAPH_SET_NODE_FEAT = 37; + PS_GRAPH_SAMPLE_NODES_FROM_ONE_SERVER = 38; } message PsRequestMessage { diff --git a/paddle/fluid/distributed/service/server.h b/paddle/fluid/distributed/service/server.h index 89b089386f501..dffe19545ce52 100644 --- a/paddle/fluid/distributed/service/server.h +++ b/paddle/fluid/distributed/service/server.h @@ -147,7 +147,7 @@ class PsBaseService : public PsService { public: PsBaseService() : _rank(0), _server(NULL), _config(NULL) {} virtual ~PsBaseService() {} - + virtual size_t get_rank() { return _rank; } virtual int32_t configure(PSServer *server) { _server = server; _rank = _server->rank(); @@ -167,6 +167,7 @@ class PsBaseService : public PsService { } virtual int32_t initialize() = 0; + PSServer *get_server() { return _server; } protected: size_t _rank; diff --git a/paddle/fluid/distributed/table/common_graph_table.cc b/paddle/fluid/distributed/table/common_graph_table.cc index 41f4b0dac4d96..2c20e79b3b2d3 100644 --- a/paddle/fluid/distributed/table/common_graph_table.cc +++ b/paddle/fluid/distributed/table/common_graph_table.cc @@ -305,12 +305,12 @@ Node *GraphTable::find_node(uint64_t id) { return node; } uint32_t GraphTable::get_thread_pool_index(uint64_t node_id) { - return node_id % shard_num % shard_num_per_table % task_pool_size_; + return node_id % shard_num % shard_num_per_server % task_pool_size_; } uint32_t GraphTable::get_thread_pool_index_by_shard_index( uint64_t shard_index) { - return shard_index % shard_num_per_table % task_pool_size_; + return shard_index % shard_num_per_server % task_pool_size_; } int32_t GraphTable::clear_nodes() { @@ -575,6 +575,11 @@ int32_t GraphTable::pull_graph_list(int start, int total_size, actual_size = size; return 0; } + +int32_t GraphTable::get_server_index_by_id(uint64_t id) { + return id % shard_num / shard_num_per_server; +} + int32_t GraphTable::initialize() { _shards_task_pool.resize(task_pool_size_); for (size_t i = 0; i < _shards_task_pool.size(); ++i) { @@ -611,13 +616,12 @@ int32_t GraphTable::initialize() { shard_num = _config.shard_num(); VLOG(0) << "in init graph table shard num = " << shard_num << " shard_idx" << _shard_idx; - shard_num_per_table = sparse_local_shard_num(shard_num, server_num); - shard_start = _shard_idx * shard_num_per_table; - shard_end = shard_start + shard_num_per_table; + shard_num_per_server = sparse_local_shard_num(shard_num, server_num); + shard_start = _shard_idx * shard_num_per_server; + shard_end = shard_start + shard_num_per_server; VLOG(0) << "in init graph table shard idx = " << _shard_idx << " shard_start " << shard_start << " shard_end " << shard_end; - // 
shards.resize(shard_num_per_table); - shards = std::vector(shard_num_per_table, GraphShard(shard_num)); + shards = std::vector(shard_num_per_server, GraphShard(shard_num)); return 0; } } // namespace distributed diff --git a/paddle/fluid/distributed/table/common_graph_table.h b/paddle/fluid/distributed/table/common_graph_table.h index f643337a80f7c..d681262c66480 100644 --- a/paddle/fluid/distributed/table/common_graph_table.h +++ b/paddle/fluid/distributed/table/common_graph_table.h @@ -94,6 +94,7 @@ class GraphTable : public SparseTable { int32_t remove_graph_node(std::vector &id_list); + int32_t get_server_index_by_id(uint64_t id); Node *find_node(uint64_t id); virtual int32_t pull_sparse(float *values, @@ -128,9 +129,11 @@ class GraphTable : public SparseTable { const std::vector &feature_names, const std::vector> &res); + size_t get_server_num() { return server_num; } + protected: std::vector shards; - size_t shard_start, shard_end, server_num, shard_num_per_table, shard_num; + size_t shard_start, shard_end, server_num, shard_num_per_server, shard_num; const int task_pool_size_ = 24; const int random_sample_nodes_ranges = 3; diff --git a/paddle/fluid/distributed/test/graph_node_test.cc b/paddle/fluid/distributed/test/graph_node_test.cc index 810530cdbec94..613770220f9d7 100644 --- a/paddle/fluid/distributed/test/graph_node_test.cc +++ b/paddle/fluid/distributed/test/graph_node_test.cc @@ -138,6 +138,10 @@ void testSingleSampleNeighboor( for (auto g : s) { ASSERT_EQ(true, s1.find(g) != s1.end()); } + vs.clear(); + pull_status = worker_ptr_->batch_sample_neighboors(0, {96, 37}, 4, vs, 0); + pull_status.wait(); + ASSERT_EQ(vs.size(), 2); } void testAddNode( @@ -356,6 +360,7 @@ void RunServer() { pserver_ptr_->configure(server_proto, _ps_env, 0, empty_vec); LOG(INFO) << "first server, run start(ip,port)"; pserver_ptr_->start(ip_, port_); + pserver_ptr_->build_peer2peer_connection(0); LOG(INFO) << "init first server Done"; } @@ -373,6 +378,7 @@ void RunServer2() { empty_vec2.push_back(empty_prog2); pserver_ptr2->configure(server_proto2, _ps_env2, 1, empty_vec2); pserver_ptr2->start(ip2, port2); + pserver_ptr2->build_peer2peer_connection(1); } void RunClient( From 921c0917a37b6d5012f6290b6c061a1266d10a22 Mon Sep 17 00:00:00 2001 From: niuliling123 <51102941+niuliling123@users.noreply.github.com> Date: Thu, 21 Oct 2021 11:45:38 +0800 Subject: [PATCH 058/116] Fix a bug in ReadData, ReadDataBc and ReadDataReduce when NX != 1 (#36373) * Update the implement of reduceAnyKernel according to kernel primitive api * Fix a bug in ReadData, ReadDataBc and ReadDataReduce when NX != 1 --- .../elementwise/elementwise_op_broadcast.cu.h | 2 +- .../fluid/operators/fused/attn_bias_add.cu.h | 4 +- .../kernel_primitives/compute_primitives.h | 74 +++-- .../kernel_primitives/datamover_primitives.h | 286 +++++++++++++----- .../fluid/operators/reduce_ops/reduce_op.cu.h | 59 ++-- 5 files changed, 286 insertions(+), 139 deletions(-) diff --git a/paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h b/paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h index 53ac85802a6f4..549a6be0b4507 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h +++ b/paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h @@ -171,7 +171,7 @@ __device__ __forceinline__ void LoadData( // num: how many data will be deal with in this time if (need_broadcast) { kps::ReadDataBc(dst, src, block_offset, - config, numel, 1, 1); + config, numel); } else { kps::ReadData(dst, src + 
block_offset, num); } diff --git a/paddle/fluid/operators/fused/attn_bias_add.cu.h b/paddle/fluid/operators/fused/attn_bias_add.cu.h index fa3eb19b29995..18ae932c9325a 100644 --- a/paddle/fluid/operators/fused/attn_bias_add.cu.h +++ b/paddle/fluid/operators/fused/attn_bias_add.cu.h @@ -72,14 +72,14 @@ __global__ void BroadcastKernelBinary( // load in0 if (use_broadcast[0]) { kernel_primitives::ReadDataBc( - arg0, in0, fix, configlists[0], numel, 1, 1); + arg0, in0, fix, configlists[0], numel); } else { kernel_primitives::ReadData(arg0, in0 + fix, num); } // load in1 if (use_broadcast[1]) { kernel_primitives::ReadDataBc( - arg1, in1, fix, configlists[1], numel, 1, 1); + arg1, in1, fix, configlists[1], numel); } else { kernel_primitives::ReadData(arg1, in1 + fix, num); } diff --git a/paddle/fluid/operators/kernel_primitives/compute_primitives.h b/paddle/fluid/operators/kernel_primitives/compute_primitives.h index a36c76d788173..73316d66b6cf2 100644 --- a/paddle/fluid/operators/kernel_primitives/compute_primitives.h +++ b/paddle/fluid/operators/kernel_primitives/compute_primitives.h @@ -135,17 +135,16 @@ __device__ __forceinline__ T BlockYReduce(T val, ReduceOp reducer) { } // namespace details /** - * @brief Perform unary calculation according to OpFunc. Size of input and + * @brief Perform unary calculation according to OpFunc. Shape of input and * output are the same. * * @template paraments - * InT: Data type of in. - * OutT: Data type of out. + * InT: The data type of in. + * OutT: The data type of out. * NX: The number of data columns loaded by each thread. * NY: The number of data rows loaded by each thread. * BlockSize: Identifies the current device thread index method. For GPU, - * threadIdx.x is used as the thread index, and for xpu, core_id() is used as - * the index. Currently only GPU was supported. + * threadIdx.x is used as the thread index. Currently only GPU was supported. * OpFunc: Compute functor which has an operator() as following: * template * struct XxxFunctor { @@ -170,21 +169,20 @@ __device__ __forceinline__ void ElementwiseUnary(OutT* out, const InT* in, } /** - * @brief Binary calculation according to OpFunc. Size of The input and output + * @brief Binary calculation according to OpFunc. Shape of The input and output * are the same. * * @template paraments - * InT: Data type of in1 and in2. - * OutT: Data type of out. - * NX: The number of data columns loaded by each thread. - * NY: The number of data rows loaded by each thread. + * InT: The data type of in1 and in2. + * OutT: The data type of out. + * NX: The number of data columns computed by each thread. + * NY: The number of data rows computed by each thread. * BlockSize: Identifies the current device thread index method. For GPU, - * threadIdx.x is used as the thread index, and for xpu, core_id() is used as - * the index. Currently only GPU was supported. + * threadIdx.x is used as the thread index. Currently only GPU was supported. * OpFunc: Compute functor which has an operator() as following: - * template + * template * struct XxxFunctor { - * HOSTDEVICE OutT operator()(const InT& a, const InT& b) const { + * HOSTDEVICE InT operator()(const InT& a, const InT& b) const { * return ...; * } * }; @@ -193,7 +191,7 @@ __device__ __forceinline__ void ElementwiseUnary(OutT* out, const InT* in, * out: The register pointer of out, the size is NX * NY. * in1: The register pointer of fist input, size is NX * NY. * in2: The register pointer of second input, size is NX * NY. 
- * compute: Compute function which was declared like OpFunc(). + * compute: Compute function which was declared like OpFunc(). */ template @@ -207,21 +205,20 @@ __device__ __forceinline__ void ElementwiseBinary(OutT* out, const InT* in1, } /** - * @brief Ternary calculation according to OpFunc. Size of input and output + * @brief Ternary calculation according to OpFunc. Shape of input and output * are the same. * * @template paraments - * InT: Data type of in1 and in2. - * OutT: Data type of out. + * InT: The data type of in1 and in2. + * OutT: The data type of out. * NX: The number of data columns loaded by each thread. * NY: The number of data rows loaded by each thread. * BlockSize: Identifies the current device thread index method. For GPU, - * threadIdx.x is used as the thread index, and for xpu, core_id() is used as - * the index. Currently only GPU was supported. + * threadIdx.x is used as the thread index. Currently only GPU was supported. * OpFunc: Compute functor which has an operator() as following - * template + * template * struct XxxFunctor { - * HOSTDEVICE OutT operator()(const InT& a, const InT& b, const InT& c) + * HOSTDEVICE InT operator()(const InT& a, const InT& b, const InT& c) * const { * return ...; * } @@ -232,7 +229,7 @@ __device__ __forceinline__ void ElementwiseBinary(OutT* out, const InT* in1, * in1: The register pointer of fist input, size is NX * NY. * in2: The register pointer of second input, size is NX * NY. * in3: The register pointer of third input, size is NX * NY. - * compute: Compute function which was declared like OpFunc(). + * compute: Compute function which was declared like OpFunc(). */ template @@ -247,30 +244,29 @@ __device__ __forceinline__ void ElementwiseTernary(OutT* out, const InT* in1, } /** - * @brief Multivariate calculation according to OpFunc. Size of input and output - * are the same. + * @brief Multivariate calculation according to OpFunc. Shape of inputs and + * output are the same. * * @template paraments - * InT: Data type of in1, in2 and in3. - * OutT: Data type of out. + * InT: The data type of in1, in2 and in3. + * OutT: The data type of out. * NX: The number of data columns loaded by each thread. * NY: The number of data rows loaded by each thread. * BlockSize: Identifies the current device thread index method. For GPU, - * threadIdx.x is used as the thread index, and for xpu, core_id() is used as - * the index. Currently only GPU was supported. - * Arity: The size of ins + * threadIdx.x is used as the thread index. Currently only GPU was supported. + * Arity: The size of ins. * OpFunc: Compute functor which has an operator() as following: - * template + * template * struct XxxFunctor { - * HOSTDEVICE OutT operator()(const InT* args) const { + * HOSTDEVICE InT operator()(const InT* args) const { * return ...; * } * }; * * @param * out: The register pointer of out, the size is NX * NY. - * ins: An array of pointers consisting of multiple inputs. - * compute: Compute function which was declared like OpFunc(). + * ins: A pointers of array consisting of multiple inputs. + * compute: Compute function which was declared like OpFunc(). */ template @@ -293,13 +289,12 @@ __device__ __forceinline__ void ElementwiseAny(OutT* out, InT (*ins)[NX * NY], * shape is [NY, NX]. * * @template paraments - * InT: Data type of in1 and in2. - * OutT: Data type of out. + * InT: The data type of in1 and in2. + * OutT: The data type of out. * NX: The number of data columns loaded by each thread. * NY: The number of data rows loaded by each thread. 
* BlockSize: Identifies the current device thread index method. For GPU, - * threadIdx.x is used as the thread index, and for xpu, core_id() is used as - * the index. Currently only GPU was supported. + * threadIdx.x is used as the thread index. Currently only GPU was supported. * OpFunc: Compute functor which has an operator() as following * template * struct XxxFunctor { @@ -339,8 +334,7 @@ __device__ __forceinline__ void CycleBinary(OutT* out, const InT* in1, * NX: The number of data continuously loaded by each thread. * NY: The number of data rows loaded by each thread, only NY = 1 was supported. * BlockSize: Identifies the current device thread index method. For GPU, - * threadIdx.x is used as the thread index, and for xpu, core_id() is used as - * the index. Currently only GPU was supported. + * threadIdx.x is used as the thread index. Currently only GPU was supported. * ReduceFunctor: Compute functor which has an operator() as following * template * struct ReduceFunctor { diff --git a/paddle/fluid/operators/kernel_primitives/datamover_primitives.h b/paddle/fluid/operators/kernel_primitives/datamover_primitives.h index c720bedf0a3af..860072bd0c52e 100644 --- a/paddle/fluid/operators/kernel_primitives/datamover_primitives.h +++ b/paddle/fluid/operators/kernel_primitives/datamover_primitives.h @@ -118,8 +118,8 @@ struct BroadcastConfig { } // namespace details /** - * @brief Read 2D data from global memory to registers according to Tx type, and - * store it as Ty type. + * @brief Read 2D data from global memory to register according to Tx type, and + * store it as Ty type into register. * * @template paraments * Tx: The type of data stored in the global memory. @@ -127,8 +127,7 @@ struct BroadcastConfig { * NX: The number of data columns loaded by each thread. * NY: The number of data rows loaded by each thread. * BlockSize: Identifies the current device thread index method. For GPU, - * threadIdx.x is used as the thread index, and for xpu, core_id() is used as - * the index. Currently only GPU was supported. + * threadIdx.x is used as the thread index. Currently only GPU was supported. * IsBoundary: Indicates whether to perform block access storage out-of-bounds * judgment. When the number of data processed by the block is less than * NX x NY x blockDim, boundary judgment is required to avoid memory access @@ -136,20 +135,20 @@ struct BroadcastConfig { * * @paramļ¼š * dst: The register pointer of the thread, the size is NX * NY. - * src: Data pointer of the current block. - * size_nx: The current block needs to load size_nx columns of data, this - * parameter will be used when IsBoundary = true. - * size_ny: The current block needs to load size_ny rows of data. This parameter - * will be used when IsBoundary = true. - * stride_nx: The stride of cols. - * stride_ny: The stride of rows. + * src: The data pointer of the current block. + * size_nx: The maximum offset of the current block is size_nx elements in the + * lowest dimension. The parameters are only calculated when isboundary = true. + * size_ny: The maximum offset of the current block is size_ny elements in the + * first dimension. The parameters are only calculated when isboundary = true. + * stride_nx: Each read one element stride stride_nx elements in the last dim. + * stride_ny: Each read one element stride stride_ny elements in the first dim. 
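+ *
+ * For example, with NX = NY = 2 the thread at threadIdx.x = t reads the four
+ * elements src[t], src[t + stride_nx], src[t + stride_ny] and
+ * src[t + stride_nx + stride_ny], one into each register of dst.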
*/ template __device__ __forceinline__ void ReadData(Ty* dst, const Tx* __restrict__ src, int size_nx, int size_ny, int stride_nx, int stride_ny) { - int thread_offset = threadIdx.x * NX; + int thread_offset = threadIdx.x; int left_size_nx = size_nx - thread_offset; // Each branch is added for better performance @@ -165,7 +164,7 @@ __device__ __forceinline__ void ReadData(Ty* dst, const Tx* __restrict__ src, #pragma unroll for (int idy = 0; idy < NY; ++idy) { if (IsBoundary) { - if (idy >= size_ny) { + if (idy * stride_ny >= size_ny) { break; } } @@ -175,7 +174,7 @@ __device__ __forceinline__ void ReadData(Ty* dst, const Tx* __restrict__ src, #pragma unroll for (int idx = 0; idx < NX; ++idx) { if (IsBoundary) { - if (idx >= left_size_nx) { + if (idx * stride_nx >= left_size_nx) { break; } } @@ -185,14 +184,14 @@ __device__ __forceinline__ void ReadData(Ty* dst, const Tx* __restrict__ src, #pragma unroll for (int idx = 0; idx < NX; ++idx) { if (IsBoundary) { - if (idx >= left_size_nx) { + if (idx * stride_nx >= left_size_nx) { break; } } #pragma unroll for (int idy = 0; idy < NY; ++idy) { if (IsBoundary) { - if (idy >= size_ny) { + if (idy * stride_ny >= size_ny) { break; } } @@ -223,25 +222,24 @@ __device__ __forceinline__ void Init(T* dst, T init_data) { } /** - * @brief Read 2D data from global memory to registers. When IsBoundary = true + * @brief Read 1D data from global memory to register. When IsBoundary = true * and (NX % 4 == 0 or Nx % 2 == 0), vectorized load data will be used to * improve memory access efficiency. * * @template paraments - * T: Data type of src and dst. - * NX: The number of data continuously loaded by each thread. - * NY: The number of data rows loaded by each thread, only NY = 1 was supported. + * T: The type of data. + * NX: Each thread load NX data from global memory continuously. + * NY: Each thread need to load NY rows, only NY = 1 was supported. * BlockSize: Identifies the current device thread index method. For GPU, - * threadIdx.x is used as the thread index, and for xpu, core_id() is used as - * the index. Currently only GPU was supported. + * threadIdx.x is used as the thread index. Currently only GPU was supported. * IsBoundary: Whether to make an out-of-bounds judgment on access to memory. * When the number of data processed by this block is less than - * NX x NY x blockDim, boundary judgment is required to avoid memory access + * NX x NY x blockDim.x, boundary judgment is required to avoid memory access * crossing the boundary. * * @paramļ¼š * dst: The register pointer of the thread, the size is NX * NY. - * src: Data pointer of the current block. + * src: The data pointer of the current block. * size: The current block needs to load size data continuously. */ template @@ -276,31 +274,29 @@ __device__ __forceinline__ void ReadData(T* dst, const T* __restrict__ src, } /** - * @brief Read 2D data from global memory to registers for broadcast. + * @brief Read 2D data from global memory to registers with broadcast form. * * @template paraments * T: The type of data stored in the global memory. * NX: The number of data columns loaded by each thread. * NY: The number of data rows loaded by each thread. * BlockSize: Identifies the current device thread index method. For GPU, - * threadIdx.x is used as the thread index, and for xpu, core_id() is used as - * the index. Currently only GPU was supported. + * threadIdx.x is used as the thread index. Currently only GPU was supported. * Rank: The shape size of out. eg in[1, 35], out[32, 35] then shape size is 2. 
* IsBoundary: Indicates whether to perform block access storage out-of-bounds * judgment. When the number of data processed by the block is less than - * NX x NY x blockDim, boundary judgment is required to avoid memory access + * NX x NY x blockDim.x, boundary judgment is required to avoid memory access * crossing the boundary. * * @paramļ¼š * dst: The register pointer of the thread, the size is NX * NY. - * src: Raw input data pointer of kernel. - * block_offset: Data offset of this block, blockDim.x * blockIdx.x * NX; + * src: The original input data pointer of this kernel. + * block_offset: The data offset of this block, blockDim.x * blockIdx.x * NX. * config: Calculation configuration of broadcast. It is used to calculate the - * coordinate mapping relationship between output data and input data. Please - * refer to the sample code for specific usage. + * coordinate mapping relationship between output data and input data. * total_num_output: Total number of original output. - * stride_nx: The stride of cols. - * stride_ny: The stride of rows. + * stride_nx: Each read one element stride stride_nx elements in the last dim. + * stride_ny: Each read one element stride stride_ny elements in the first dim. */ template @@ -308,7 +304,7 @@ __device__ __forceinline__ void ReadDataBc( T* dst, const T* __restrict__ src, uint32_t block_offset, details::BroadcastConfig config, int total_num_output, int stride_nx, int stride_ny) { - uint32_t thread_offset = block_offset + threadIdx.x * NX; + uint32_t thread_offset = block_offset + threadIdx.x; uint32_t index_src = 0; #pragma unroll @@ -334,37 +330,33 @@ __device__ __forceinline__ void ReadDataBc( } /** - * @brief Read 2D data from global memory to registers for reduce. + * @brief Read 2D data from global memory to register with reduce form. * * @template paraments - * T: The type of data stored in the global memory. + * T: The type of data. * NX: The number of data columns loaded by each thread. * NY: The number of data rows loaded by each thread. * BlockSize: Identifies the current device thread index method. For GPU, - * threadIdx.x is used as the thread index, and for xpu, core_id() is used as - * the index. Currently only GPU was supported. + * threadIdx.x is used as the thread index. Currently only GPU was supported. * Rank: The shape size of out. eg in[1, 35], out[32, 35] then shape size is 2. * IsBoundary: Indicates whether to perform block access storage out-of-bounds * judgment. When the number of data processed by the block is less than - * NX x NY x blockDim, boundary judgment is required to avoid memory access + * NX x NY x blockDim.x, boundary judgment is required to avoid memory access * crossing the boundary. * * @paramļ¼š * dst: The register pointer of the thread, the size is NX * NY. - * src: Raw input data pointer of kernel. - * block_offset: Data offset of this block, blockDim.x * blockIdx.x * NX; + * src: The input data pointer of this block. + * block_offset: The data offset of this block, blockDim.x * blockIdx.x * NX. * index_cal: Calculation configuration of Reduce. It is used to calculate the - * coordinate mapping relationship between output data and input data. Please - * refer to the sample code for specific usage. - * block_offset: data offset of this block, blockDim.x * blockIdx.x * NX; - * index_cal: get the global index in src, attention config was declared in - * host; + * coordinate mapping relationship between output data and input data. 
* size_nx: The current block needs to load size_nx columns of data, this - * parameter will be used when IsBoundary = true. - * size_ny: The current block needs to load size_ny rows of data. This parameter + * parameter will participate in the calculation when isboundary = true. + * size_ny: The current block needs to load size_ny rows of data, this parameter + * will participate in the calculation when isboundary = true. * will be used when IsBoundary = true. - * stride_nx: The stride of cols. - * stride_ny: The stride of rows. + * stride_nx: Each read one element stride stride_nx columns. + * stride_ny: Each read one element stride stride_ny raws. * reduce_last_dim: Used to indicate whether the dimension of reduce contains * the lowest dimension. */ @@ -375,10 +367,13 @@ __device__ __forceinline__ void ReadDataReduce( const IndexCal& index_cal, int size_nx, int size_ny, int stride_nx, int stride_ny, bool reduce_last_dim) { int thread_offset = 0; + int left_idx = 0; if (reduce_last_dim) { - thread_offset = block_offset + threadIdx.x; + thread_offset = threadIdx.x; + left_idx = threadIdx.y; } else { - thread_offset = block_offset + threadIdx.y; + thread_offset = threadIdx.y; + left_idx = threadIdx.x; } if (NX == 1) { @@ -389,30 +384,25 @@ __device__ __forceinline__ void ReadDataReduce( break; } } - uint32_t index_src = index_cal(thread_offset); + uint32_t index_src = index_cal(thread_offset + block_offset); dst[ny] = src[index_src]; thread_offset += stride_ny; } } else { #pragma unroll for (int nx = 0; nx < NX; ++nx) { - if (IsBoundary) { - if (nx * stride_nx >= size_nx) { - break; - } - } #pragma unroll for (int ny = 0; ny < NY; ++ny) { if (IsBoundary) { - if (nx * stride_nx >= size_nx) { + if ((thread_offset >= size_ny) || + (left_idx + nx * stride_nx >= size_nx)) { break; } } - uint32_t index_src = index_cal(thread_offset); + uint32_t index_src = index_cal(thread_offset + block_offset); dst[nx + ny * NX] = src[index_src]; thread_offset += stride_ny; } - thread_offset += stride_nx; } } } @@ -424,20 +414,19 @@ __device__ __forceinline__ void ReadDataReduce( * * @template paraments * T: The type of data. - * NX: The number of data continuously loaded by each thread. + * NX: The number of data continuously writed by each thread. * NY: The number of data rows loaded by each thread, only NY = 1 was supported. * BlockSize: Identifies the current device thread index method. For GPU, - * threadIdx.x is used as the thread index, and for xpu, core_id() is used as - * the index. Currently only GPU was supported. + * threadIdx.x is used as the thread index. Currently only GPU was supported. * IsBoundary: Indicates whether to perform block access storage out-of-bounds * judgment. When the number of data processed by the block is less than - * NX x NY x blockDim, boundary judgment is required to avoid memory access + * NX x NY x blockDim.x, boundary judgment is required to avoid memory access * crossing the boundary. * * @paramļ¼š - * dst: Data pointer of the current block. - * src: The register pointer of the thread, the size is NX * NY. - * size: The current block needs to load size data continuously. + * dst: The data pointer of the current block. + * src: The register pointer, the size is NX * NY. + * size: The current block needs to load size elements continuously. 
*/ template __device__ __forceinline__ void WriteData(T* dst, T* __restrict__ src, @@ -467,6 +456,165 @@ __device__ __forceinline__ void WriteData(T* dst, T* __restrict__ src, } } +/** + * @brief Write 2D data from register to global memory according to Tx type, and + * store it as Ty type. + * + * @template paraments + * Tx: The type of data that needs to be stored in registers. + * Ty: The type of data that stored in the global memory. + * NX: The number of data columns loaded by each thread. + * NY: The number of data rows loaded by each thread. + * BlockSize: Identifies the current device thread index method. For GPU, + * threadIdx.x is used as the thread index. Currently only GPU was supported. + * IsBoundary: Indicates whether to perform block access storage out-of-bounds + * judgment. When the number of data processed by the block is less than + * NX x NY x blockDim.x, boundary judgment is required to avoid memory access + * crossing the boundary. + * + * @paramļ¼š + * dst: The data pointer of the current block. + * src: The register pointer of the thread, the size is NX * NY. + * size_nx: The maximum offset of the current block is size_nx elements in the + * lowest dimension. The parameters are only calculated when isboundary = true. + * size_ny: The maximum offset of the current block is size_ny elements in the + * first dimension. The parameters are only calculated when isboundary = true. + * stride_nx: Each read one element stride stride_nx elements in the last dim. + * stride_ny: Each read one element stride stride_ny elements in the first dim. + */ +template +__device__ __forceinline__ void WriteData(Ty* dst, const Tx* __restrict__ src, + int size_nx, int size_ny, + int stride_nx, int stride_ny) { + int thread_offset = threadIdx.x; + int left_size_nx = size_nx - thread_offset; + + // Each branch is added for better performance + if (NX == 1 && NY == 1) { // for NX == 1 and NY == 1 + if (IsBoundary) { + if (left_size_nx > 0) { + dst[thread_offset] = static_cast(src[0]); + } + } else { + dst[thread_offset] = static_cast(src[0]); + } + } else if (NX == 1) { // for NX == 1 and NY != 1 +#pragma unroll + for (int idy = 0; idy < NY; ++idy) { + if (IsBoundary) { + if (idy * stride_ny >= size_ny) { + break; + } + } + dst[thread_offset + idy * stride_ny] = static_cast(src[idy]); + } + } else if (NY == 1) { // for NY == 1 and NX != 1 +#pragma unroll + for (int idx = 0; idx < NX; ++idx) { + if (IsBoundary) { + if (idx * stride_nx >= left_size_nx) { + break; + } + } + dst[thread_offset + idx * stride_nx] = static_cast(src[idx]); + } + } else { // for NX != 1 and NY != 1 +#pragma unroll + for (int idx = 0; idx < NX; ++idx) { + if (IsBoundary) { + if (idx * stride_nx >= left_size_nx) { + break; + } + } +#pragma unroll + for (int idy = 0; idy < NY; ++idy) { + if (IsBoundary) { + if (idy * stride_ny >= size_ny) { + break; + } + } + dst[thread_offset + idx * stride_nx + idy * stride_ny] = + static_cast(src[idy * NX + idx]); + } + } + } +} + +/** + * @brief Initialize register with init_data. + * + * @template paraments + * T: Data type of register. + * NX: Number of data to initialize. + * + * @paramļ¼š + * dst: The register pointer of the thread, the size is NX. + * init_data: The register pointer of init data, the size is NX. 
+ */ +template +__device__ __forceinline__ void Init(T* dst, T* init_data, int num) { +#pragma unroll + for (int i = 0; i < NX; i++) { + if (IsBoundary) { + if (i >= num) { + break; + } + } + dst[i] = init_data[i]; + } +} + +/** + * @brief Read 1D data from global memory to register with broadcast form. + * + * @template paraments + * T: The type of data stored in the global memory. + * NX: The number of data continuously loaded by each thread. + * NY: The number of data rows loaded by each thread, only NY = 1 was supported. + * BlockSize: Identifies the current device thread index method. For GPU, + * threadIdx.x is used as the thread index. Currently only GPU was supported. + * Rank: The shape size of out. eg in[1, 35], out[32, 35] then shape size is 2. + * IsBoundary: Indicates whether to perform block access storage out-of-bounds + * judgment. When the number of data processed by the block is less than + * NX x NY x blockDim.x, boundary judgment is required to avoid memory access + * crossing the boundary. + * + * @paramļ¼š + * dst: The register pointer of the thread, the size is NX * NY. + * src: The original input data pointer of kernel. + * block_offset: The data offset of this block, blockDim.x * blockIdx.x * NX; + * config: Calculation configuration of broadcast. It is used to calculate the + * coordinate mapping relationship between output data and input data. + * total_num_output: Total number of original output. + */ +template +__device__ __forceinline__ void ReadDataBc( + T* dst, const T* __restrict__ src, uint32_t block_offset, + details::BroadcastConfig config, int total_num_output) { + uint32_t thread_offset = block_offset + threadIdx.x * NX; + uint32_t index_src = 0; + +#pragma unroll + for (uint32_t nx = 0; nx < NX; ++nx) { + uint32_t index_output = thread_offset + nx; + index_src = 0; + if (IsBoundary) { + if (index_output >= total_num_output) { + break; + } + } +#pragma unroll + for (int i = 0; i < Rank; ++i) { + auto fast_divmoder = config.divmoders[i].Divmod(index_output); + index_output = fast_divmoder.val[0]; + index_src += fast_divmoder.val[1] * config.strides[i]; + } + dst[nx] = src[index_src]; + } +} + } // namespace kernel_primitives } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/reduce_ops/reduce_op.cu.h b/paddle/fluid/operators/reduce_ops/reduce_op.cu.h index 28b6ebc243322..bf451272a47b0 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_op.cu.h +++ b/paddle/fluid/operators/reduce_ops/reduce_op.cu.h @@ -529,6 +529,31 @@ __device__ void HigherDimDealSegment(const Tx* x, Ty* y, ReduceOp reducer, kps::WriteData(y + store_offset, &temp_data, size); } +template +__device__ void ReduceAnyKernelImpl(const Tx* input, MPType* reduce_var, + ReduceOp reducer, TransformOp transformer, + MPType init, int reduce_num, int input_idx, + bool reduce_last_dim, + const Calculator& reduce_index_calculator, + int stride, int num) { + Tx input_reg[REDUCE_VEC_SIZE]; + MPType input_compute[REDUCE_VEC_SIZE]; + MPType input_transform[REDUCE_VEC_SIZE]; + + kps::Init(&input_compute[0], init); + kps::ReadDataReduce( + &input_reg[0], input, input_idx, reduce_index_calculator, 1, reduce_num, + 1, stride, reduce_last_dim); + kps::ElementwiseUnary( + &input_transform[0], &input_reg[0], transformer); + kps::Init(input_compute, input_transform, + num); + kps::Reduce( + reduce_var, &input_compute[0], reducer, reduce_last_dim); +} + // when reduce_dim.size() == 1 and reduce_dim[0] == x_dim.size() - 1, or // when reduce_dim.size() != 1 and reduce_dim.size() != 
x_dim.size(), this // function will be used @@ -570,37 +595,17 @@ __global__ void ReduceAnyKernel(const Tx* x, Ty* y, ReduceOp reducer, // 1. reduce for each thread if (left_idx < left_num) { // load REDUCE_VEC_SIZE data once, and then compute - Tx input_reg[REDUCE_VEC_SIZE]; - MPType input_compute[REDUCE_VEC_SIZE]; int bound = reduce_num - (REDUCE_VEC_SIZE - 1) * stride; for (; input_idx + block_size < bound; input_idx += REDUCE_VEC_SIZE * stride) { - kps::ReadDataReduce( - &input_reg[0], input, input_idx, reduce_index_calculator, 1, - reduce_num, 1, stride, reduce_last_dim); - kps::ElementwiseUnary( - &input_compute[0], &input_reg[0], transformer); - kps::Reduce( - &reduce_var, &input_compute[0], reducer, reduce_last_dim); - } - - kps::Init(&input_compute[0], init); - kps::ReadDataReduce( - &input_reg[0], input, input_idx, reduce_index_calculator, 1, reduce_num, - 1, stride, reduce_last_dim); - input_idx += tid; -#pragma unroll - for (int i = 0; i < REDUCE_VEC_SIZE; ++i) { - if (input_idx >= reduce_num) { - break; - } - input_compute[i] = static_cast(transformer(input_reg[i])); - input_idx += stride; + ReduceAnyKernelImpl( + input, &reduce_var, reducer, transformer, init, reduce_num, input_idx, + reduce_last_dim, reduce_index_calculator, stride, reduce_num); } - kps::Reduce( - &reduce_var, &input_compute[0], reducer, reduce_last_dim); + int num = (reduce_num - input_idx - tid + stride - 1) / stride; + ReduceAnyKernelImpl( + input, &reduce_var, reducer, transformer, init, reduce_num - input_idx, + input_idx, reduce_last_dim, reduce_index_calculator, stride, num); } kps::Reduce( From b6e7f8e9365b0c092f9790722d3896979c82b14a Mon Sep 17 00:00:00 2001 From: xiongkun Date: Thu, 21 Oct 2021 14:07:13 +0800 Subject: [PATCH 059/116] User specified backend (#35745) --- paddle/fluid/framework/fleet/gloo_wrapper.h | 18 ++ paddle/fluid/imperative/gloo_context.cc | 115 ++++++++++- paddle/fluid/imperative/gloo_context.h | 8 + python/paddle/distributed/fleet/launch.py | 51 ++++- .../paddle/distributed/fleet/launch_utils.py | 63 +++++- python/paddle/distributed/parallel.py | 27 +-- python/paddle/distributed/spawn.py | 88 +++++++-- python/paddle/distributed/utils.py | 22 ++- .../fluid/tests/unittests/CMakeLists.txt | 18 ++ .../parallel_dygraph_gradient_check.py | 3 +- .../unittests/parallel_dygraph_se_resnext.py | 1 + .../tests/unittests/test_cpuonly_launch.sh | 42 ++++ .../tests/unittests/test_cpuonly_spawn.py | 72 +++++++ .../fluid/tests/unittests/test_dist_base.py | 179 +++++++++++++++++- .../test_parallel_dygraph_dataparallel.py | 65 +++++++ ..._parallel_dygraph_sparse_embedding_gloo.py | 59 ++++++ ...graph_sparse_embedding_over_height_gloo.py | 44 +++++ .../test_parallel_dygraph_transformer_gloo.py | 61 ++++++ ..._parallel_dygraph_unused_variables_gloo.py | 72 +++++++ .../test_spawn_and_init_parallel_env.py | 5 +- 20 files changed, 948 insertions(+), 65 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/test_cpuonly_launch.sh create mode 100644 python/paddle/fluid/tests/unittests/test_cpuonly_spawn.py create mode 100644 python/paddle/fluid/tests/unittests/test_parallel_dygraph_sparse_embedding_gloo.py create mode 100644 python/paddle/fluid/tests/unittests/test_parallel_dygraph_sparse_embedding_over_height_gloo.py create mode 100644 python/paddle/fluid/tests/unittests/test_parallel_dygraph_transformer_gloo.py create mode 100644 python/paddle/fluid/tests/unittests/test_parallel_dygraph_unused_variables_gloo.py diff --git a/paddle/fluid/framework/fleet/gloo_wrapper.h 
b/paddle/fluid/framework/fleet/gloo_wrapper.h index eafc991fbca0a..f1ec042dbd705 100644 --- a/paddle/fluid/framework/fleet/gloo_wrapper.h +++ b/paddle/fluid/framework/fleet/gloo_wrapper.h @@ -238,6 +238,24 @@ class GlooWrapper { return ret; } + // TODO(xiongkun03): support all gather array of + // numbers with different length + // can use AllgathervOptions, may be work in different + // occasion. Need some survey. + template + void AllGatherVector(T* input_ptr, T* output_ptr, + size_t element_num) { // NOLINT + CHECK_EQ(is_initialized_, true); +#ifdef PADDLE_WITH_GLOO + gloo::AllgatherOptions opts(context_); + opts.setInput(input_ptr, element_num); + opts.setOutput(output_ptr, element_num * size_); + gloo::allgather(opts); +#else + LOG(WARNING) << "AllGather does nothing when WITH_GLOO=OFF"; +#endif + } + protected: bool is_initialized_ = false; #ifdef PADDLE_WITH_GLOO diff --git a/paddle/fluid/imperative/gloo_context.cc b/paddle/fluid/imperative/gloo_context.cc index d7df6ec3c1164..0d93cdf57932f 100644 --- a/paddle/fluid/imperative/gloo_context.cc +++ b/paddle/fluid/imperative/gloo_context.cc @@ -18,6 +18,7 @@ #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/place.h" #include "paddle/fluid/string/split.h" +#include "paddle/fluid/string/string_helper.h" namespace paddle { namespace framework { @@ -67,8 +68,36 @@ void GLOOParallelContext::AllReduceByStream(const framework::Variable &src, framework::Variable *dst, int ring_id, bool use_calc_stream) { // AllReduce(src, dst, strategy_, ring_id, use_calc_stream); - auto src_tensor = src.Get(); - auto *dst_tensor = dst->GetMutable(); + if (src.IsType()) { + if (!dst->IsType()) { + dst->Clear(); + } + AllReduce(src.Get(), + dst->GetMutable()); + } else if (src.IsType()) { + if (&src != dst) { + if (!dst->IsType()) { + dst->Clear(); + } + AllReduce(src.Get(), + dst->GetMutable()); + } else { + // SelectedRows cannot be allreduce in-place + framework::Variable tmp_dst; + AllReduce(src.Get(), + tmp_dst.GetMutable()); + *dst = std::move(tmp_dst); + } + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "Unsupported variable type %s for imperative allreduce, only " + "LoDTensor and SelectedRows are supported.", + platform::demangle(framework::ToTypeName(src.Type())))); + } +} + +void GLOOParallelContext::AllReduce(const framework::Tensor &src_tensor, + framework::Tensor *dst_tensor) { auto gloo_wrapper = framework::GlooWrapper::GetInstance(); dst_tensor->Resize(src_tensor.dims()); switch (src_tensor.type()) { @@ -84,6 +113,88 @@ void GLOOParallelContext::AllReduceByStream(const framework::Variable &src, gloo_wrapper->Barrier(); } +#define GLOO_ALL_GATHER_CASE(type, T, gw) \ + case type: { \ + const auto *src_tensor_ptr = src_tensor.data(); \ + gw->AllGatherVector(const_cast(src_tensor_ptr), \ + reinterpret_cast(dst_tensor_ptr), \ + value_sendcount); \ + break; \ + } + +void GLOOParallelContext::AllReduce(const framework::SelectedRows &src, + framework::SelectedRows *dst) { + // auto ; + // int local_rank = strategy_.local_rank_; + int nranks = strategy_.nranks_; + VLOG(3) << "SelectedRows AllReduce start"; + const auto &src_tensor = src.value(); + const auto &place = src_tensor.place(); + auto dtype = src_tensor.type(); + // 1. Gather rows number from all workers. 
Here use ncclAllGather to do this, + // but we can use other ways to implement is in the future + const auto &src_rows = src.rows(); + auto gloo_wrapper = framework::GlooWrapper::GetInstance(); + size_t local_row_num = src_rows.size(); + std::vector rows_num_vector = + gloo_wrapper->AllGather(local_row_num); + const auto *cpu_rows_num_ptr = rows_num_vector.data(); + auto rows_num = std::accumulate(cpu_rows_num_ptr, cpu_rows_num_ptr + nranks, + static_cast(0)); + dst->set_height(src.height()); + VLOG(3) << "Gather rows: " << string::join_strings(rows_num_vector, ',') + << ", total rows number: " << rows_num + << ", height: " << src.height(); + auto *dst_rows = dst->mutable_rows(); + dst_rows->resize(rows_num); + auto *dst_rows_ptr = dst_rows->MutableData(place); + const int64_t *src_rows_ptr = src_rows.Data(place); + + // VLOG(3) << "Selected Rows of src:" << string::join_strings(dst_rows, ',') + + auto *dst_tensor = dst->mutable_value(); + auto dims = src_tensor.dims(); + dims[0] = rows_num; + auto feature_size = framework::product(dims) / dims[0]; + dst_tensor->Resize(dims); + if (std::all_of(cpu_rows_num_ptr, cpu_rows_num_ptr + nranks, + [&](size_t row) { return row == cpu_rows_num_ptr[0]; })) { + // During sparse communication, the number of each card is same. + // Because gloo wrapper utility class currently don't support + // broadcast, so we only deal the-same case. + VLOG(3) << "Use the gloo all reduce to sync. SRC:" << src_tensor; + // framework::SerializeToStream(VLOG(4), src); + VLOG(3) << "allgather replaces broadcast to speed up in sparse allreduce"; + auto value_sendcount = cpu_rows_num_ptr[0] * feature_size; + auto *dst_tensor_ptr = dst_tensor->mutable_data(place, dtype); + + gloo_wrapper->AllGatherVector(const_cast(src_rows_ptr), + static_cast(dst_rows_ptr), + rows_num_vector[0]); + + switch (dtype) { + GLOO_ALL_GATHER_CASE(framework::proto::VarType::FP32, float, + gloo_wrapper); + GLOO_ALL_GATHER_CASE(framework::proto::VarType::FP64, double, + gloo_wrapper); + GLOO_ALL_GATHER_CASE(framework::proto::VarType::INT32, int, gloo_wrapper); + GLOO_ALL_GATHER_CASE(framework::proto::VarType::INT64, int64_t, + gloo_wrapper); + default: { + PADDLE_THROW(platform::errors::InvalidArgument( + "Invalid datatype for allreduce")); + } + } + VLOG(3) << "Selected Row DST:" << *dst_tensor; + VLOG(3) << "Selected Rows of DST:" + << string::join_strings(std::vector(*dst_rows), ','); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "The number of each card is not the same, gloo only support the-same" + "batch division")); + } +} + paddle::platform::DeviceContext *GLOOParallelContext::GetDeviceContext( int ring_id) { // return the CPUDeviceContext diff --git a/paddle/fluid/imperative/gloo_context.h b/paddle/fluid/imperative/gloo_context.h index f54dc1a406a92..305a75a881153 100644 --- a/paddle/fluid/imperative/gloo_context.h +++ b/paddle/fluid/imperative/gloo_context.h @@ -16,6 +16,9 @@ #include #include #include +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/framework/variable.h" #include "paddle/fluid/imperative/parallel_context.h" #include "paddle/fluid/platform/device_context.h" @@ -52,6 +55,11 @@ class GLOOParallelContext : public ParallelContext { void SynchronizeCompute() override; + private: + void AllReduce(const framework::Tensor& src, framework::Tensor* dst); + void AllReduce(const framework::SelectedRows& src, + framework::SelectedRows* dst); + private: std::unique_ptr device_; }; diff --git 
a/python/paddle/distributed/fleet/launch.py b/python/paddle/distributed/fleet/launch.py index c0a1c359d17c6..16b39e0fc8e45 100644 --- a/python/paddle/distributed/fleet/launch.py +++ b/python/paddle/distributed/fleet/launch.py @@ -103,7 +103,12 @@ def _parse_args(): type=str, default="log", help="The path for each process's log. Default --log_dir=log/") - + base_group.add_argument( + "--backend", + type=str, + default="auto", + help="Specifize the backend, can be gloo|nccl|bkcl|auto. Default value is auto which perfers nccl or bkcl." + ) base_group.add_argument( "--nproc_per_node", type=int, @@ -230,8 +235,21 @@ def get_cluster_from_args(args, device_mode, devices_per_proc): devices_per_proc) +def cpuonly_check(args): + if args.ips and len(args.ips.split(',')) > 1: + raise RuntimeError( + "CPUONLY launch only support single trainer, that is len(ips)=1, but got %s." + % args.ips) + if args.run_mode: + assert args.run_mode == 'cpuonly', "CPUONLY launch only support run mode is CPUONLY" + if args.servers: + raise RuntimeError("CPUONLY launch can't have --servers as arguments.") + return True + + def launch_collective(args): # parse arguments, used for cloud-single-machine and local + if args.backend == 'gloo': cpuonly_check(args) (device_mode, devices_per_proc) = launch_utils.get_device_proc_info(args) trainers_num = cloud_utils.get_trainers_num() logger.debug("parsed from args trainerss_num:{} mode:{} devices:{}".format( @@ -265,6 +283,7 @@ def launch_collective(args): global_envs["PADDLE_WITH_GLOO"] = str(os.getenv("PADDLE_WITH_GLOO", "0")) global_envs["PADDLE_GLOO_RENDEZVOUS"] = "3" global_envs["PADDLE_GLOO_FS_PATH"] = gloo_rendezvous_dir + global_envs["PADDLE_DISTRI_BACKEND"] = args.backend procs = start_local_trainers( cluster, @@ -349,9 +368,12 @@ def which_distributed_mode(args): if fluid.core.is_compiled_with_cuda(): accelerators = fluid.core.get_cuda_device_count() + args.backend = 'nccl' elif fluid.core.is_compiled_with_npu(): + args.backend = 'unknown' accelerators = fluid.core.get_npu_device_count() elif fluid.core.is_compiled_with_xpu(): + args.backend = 'bkcl' accelerators = fluid.core.get_xpu_device_count() else: accelerators = 0 @@ -372,10 +394,14 @@ def which_distributed_mode(args): else: if not fluid.core.is_compiled_with_cuda( ) and not fluid.core.is_compiled_with_xpu(): - logger.warning( - "Not found distinct arguments and not compiled with cuda or xpu. Default use ps mode" - ) - return DistributeMode.PS + if args.servers: + logger.warning( + "Not found distinct arguments and not compiled with cuda or xpu. \ +But found args.servers not empty, default use ps mode") + return DistributeMode.PS + else: + args.backend = "gloo" + return DistributeMode.COLLECTIVE else: logger.warning( "Not found distinct arguments and compiled with cuda or xpu. 
Default use collective mode" @@ -556,7 +582,20 @@ def launch(): logger = get_logger() _print_arguments(args) - distribute_mode = which_distributed_mode(args) + if args.backend == 'auto': + distribute_mode = which_distributed_mode(args) + assert args.backend in [ + 'gloo', 'nccl', 'bkcl', 'unknown' + ] # which_distributed_mode must modify args.backend + else: + assert args.run_mode == 'collective' or args.run_mode == None, "When backend is not 'auto', run mode must be collective" + check_backend(args.backend) + distribute_mode = DistributeMode.COLLECTIVE + + block_windows_and_macos( + args.backend) # raise error when using gloo on windows or macos + if args.backend == 'gloo': + logger.warning("launch start with CPUONLY mode") if enable_elastic(args, distribute_mode): launch_elastic(args, distribute_mode) diff --git a/python/paddle/distributed/fleet/launch_utils.py b/python/paddle/distributed/fleet/launch_utils.py index e114670440c06..3aced0ab996cb 100644 --- a/python/paddle/distributed/fleet/launch_utils.py +++ b/python/paddle/distributed/fleet/launch_utils.py @@ -22,6 +22,7 @@ import tempfile import shutil from contextlib import closing +import multiprocessing import socket import warnings import six @@ -30,6 +31,7 @@ import paddle import paddle.fluid as fluid from distutils.util import strtobool +import paddle.utils.cpp_extension.extension_utils as utils logger = logging.getLogger("root") logger.propagate = False @@ -669,29 +671,31 @@ def get_xpus(xpus): return res_xpus -def get_device_mode(): +def get_device_mode(backend): if fluid.core.is_compiled_with_npu() and \ fluid.core.get_npu_device_count() > 0: print("launch train in ascend npu mode!") return DeviceMode.ASCEND_NPU - if fluid.core.is_compiled_with_cuda() and \ + if backend == 'nccl' and \ fluid.core.get_cuda_device_count() > 0: print("launch train in GPU mode!") return DeviceMode.GPU - if fluid.core.is_compiled_with_xpu() and fluid.core.get_xpu_device_count( - ) > 0: + if backend == 'bkcl' and fluid.core.get_xpu_device_count() > 0: print("launch train in XPU mode") return DeviceMode.XPU - print("launch train in CPU mode") - return DeviceMode.CPU + if backend == 'gloo': + print("launch train in CPU mode") + return DeviceMode.CPU + + raise RuntimeError("Don't supported devices") def get_device_proc_info(args): # device_mode - device_mode = get_device_mode() + device_mode = get_device_mode(args.backend) # devices devices_per_proc = [] @@ -722,6 +726,9 @@ def get_device_proc_info(args): else: devices_per_proc = xpus elif device_mode == DeviceMode.CPU: + if hasattr(args, "paddle_cpuonly") and args.nproc_per_node is None: + #NOTE (xiongkun03) set it to cpu core number + args.nproc_per_node = multiprocessing.cpu_count() if args.nproc_per_node is None: devices_per_proc = [0] else: @@ -1237,3 +1244,45 @@ def start_pod_heter_worker(self, args, pod): tp.cmd = cmd self.procs["heter_worker"].append(tp) + + +def check_backend(backend): + if backend not in ['nccl', 'gloo', 'bkcl', 'auto']: + raise ValueError( + "paddle.distributed initialize error, " + "backend argument can only be one of 'nccl', 'gloo', 'bkcl', 'auto', but got %s" + % backend) + + if backend == 'nccl' and not fluid.core.is_compiled_with_cuda(): + raise ValueError( + "paddle.distributed initialize error, " + "your paddle is not compiled with cuda but you assign 'nccl' as backend." 
+ ) + + if backend == 'bkcl' and not fluid.core.is_compiled_with_xpu(): + raise ValueError( + "paddle.distributed initialize error, " + "your paddle is not compiled with xpu but you assign 'bkcl' as backend." + ) + + +def block_windows_and_macos(backend): + if backend != 'gloo': return + if utils.OS_NAME.startswith('darwin'): # MACOS , block + raise ValueError( + "You are going to using gloo on macos, but currently is not supported" + ) + if utils.IS_WINDOWS: # MACOS , block + raise ValueError( + "You are going to using gloo on windows, but currently is not supported" + ) + + +def get_backend_by_compile_flag(): + if fluid.core.is_compiled_with_cuda(): + return 'nccl' + + if fluid.core.is_compiled_with_xpu(): + return 'bkcl' + + return 'gloo' diff --git a/python/paddle/distributed/parallel.py b/python/paddle/distributed/parallel.py index 7789b17429c4e..34c74ad30679e 100644 --- a/python/paddle/distributed/parallel.py +++ b/python/paddle/distributed/parallel.py @@ -26,6 +26,7 @@ from paddle.fluid import core from paddle.fluid.framework import _set_expected_place from paddle.fluid.dygraph import parallel_helper +from paddle.distributed.fleet.launch_utils import check_backend from paddle.fluid.dygraph.parallel import ParallelEnv from paddle.distributed.fleet.base.private_helper_function import wait_server_ready # noqa: F401 @@ -55,25 +56,8 @@ def _start_kv_server(port, http_server_d, size): http_server.stop() -def _check_backend(backend): - if backend not in ['nccl', 'gloo', 'bkcl', 'auto']: - raise ValueError( - "paddle.distributed initialize error, " - "backend argument can only be one of 'nccl', 'gloo', 'bkcl', 'auto', but got %s" - % backend) - - if backend == 'nccl' and not core.is_compiled_with_cuda(): - raise ValueError( - "paddle.distributed initialize error, " - "your paddle is not compiled with cuda but you assign 'nccl' as backend." - ) - - if backend == 'bkcl' and not core.is_compiled_with_xpu(): - raise ValueError( - "paddle.distributed initialize error, " - "your paddle is not compiled with xpu but you assign 'bkcl' as backend." - ) - +def _is_cpuonly(backend): + check_backend(backend) if backend in ['auto', 'nccl', 'bkcl'] and (core.is_compiled_with_cuda() or core.is_compiled_with_xpu()): # passes 'auto' and can use cuda or xpu, use the default logics. so return False @@ -82,7 +66,7 @@ def _check_backend(backend): return True -def init_parallel_env(backend='auto'): +def init_parallel_env(): """ Initialize parallel training environment in dynamic graph mode. @@ -154,7 +138,8 @@ def train(): return # NOTE(xiongkun): support cpu gloo only, add this environment variable to # enable cpu only gloo prarllel training) - is_cpu_only = _check_backend(backend) + backend = os.environ.get('PADDLE_DISTRI_BACKEND', 'auto') + is_cpu_only = _is_cpuonly(backend) # 1. 
gpu xpu check, must be gpu or xpu, if not (is_cpu_only or core.is_compiled_with_cuda() or core.is_compiled_with_xpu()): diff --git a/python/paddle/distributed/spawn.py b/python/paddle/distributed/spawn.py index a60e4642e494d..cea831d9d90b5 100644 --- a/python/paddle/distributed/spawn.py +++ b/python/paddle/distributed/spawn.py @@ -24,8 +24,10 @@ from paddle.distributed.utils import _print_arguments from paddle.distributed.utils import _prepare_trainer_env from paddle.distributed.utils import get_host_name_ip -from paddle.distributed.cloud_utils import get_cluster_and_pod +from paddle.distributed.cloud_utils import get_cluster_and_pod, _get_trainers_num +from paddle.distributed.fleet.launch import get_cluster_from_args from paddle.distributed.fleet.cloud_utils import use_paddlecloud +from paddle.distributed.fleet.launch_utils import DeviceMode, check_backend, block_windows_and_macos from paddle.device import get_device # deprecated module import @@ -71,7 +73,9 @@ def _py_supported_check(): def _options_valid_check(options): # `print_config` keeped as a debug options, not show to users - supported_options = ['start_method', 'ips', 'gpus', 'xpus', 'print_config'] + supported_options = [ + 'start_method', 'ips', 'gpus', 'xpus', 'print_config', 'backend' + ] deprecated_options = [ 'selected_devices', 'started_port', 'cluster_node_ips', 'node_ip', 'use_paddlecloud' @@ -95,6 +99,22 @@ def _get_default_nprocs(): return core.get_cuda_device_count() elif 'xpu' in device: return core.get_xpu_device_count() + elif 'cpu' in device: + return multiprocessing.cpu_count() + else: + raise RuntimeError( + "`paddle.distributed.spawn` does not support parallel training on device `{}` now.". + format(device)) + + +def _get_default_backend(): + device = get_device() + if 'gpu' in device: + return 'nccl' + elif 'xpu' in device: + return 'bkcl' + elif 'cpu' in device: + return 'gloo' else: raise RuntimeError( "`paddle.distributed.spawn` does not support parallel training on device `{}` now.". @@ -112,6 +132,16 @@ def _get_node_ip(ips): def _get_subprocess_env_list(nprocs, options): + # NOTE (xiongkun03) Why put backend deduction here ? + # Becase _get_subprocess_env_list is used by many testcases. + # So for campability, we put backend deduction here + + # logic for handle backend option + if 'backend' not in options or options['backend'] == 'auto': + options['backend'] = _get_default_backend() + check_backend(options['backend']) + block_windows_and_macos(options['backend']) + # contruct processes env list processes_env_list = [] @@ -133,7 +163,7 @@ def _get_subprocess_env_list(nprocs, options): # if we set FLAGS_selected_gpus or FLAGS_selected_xpus to be `0,1,2,3`, it may cause error # when using `ParallelEnv` # NOTE(chenweihang): use absolute gpu or xpu card id - if core.is_compiled_with_cuda(): + if options['backend'] == 'nccl': args.selected_devices = options.get('gpus', None) if args.selected_devices is None: args.selected_devices = options.get('selected_devices', None) @@ -168,7 +198,7 @@ def _get_subprocess_env_list(nprocs, options): "CUDA_VISIBLE_DEVICES (%s)." % (card_id, ",".join(env_devices_list))) - elif core.is_compiled_with_xpu(): + elif options['backend'] == 'bkcl': args.selected_devices = options.get('xpus', None) if args.selected_devices is None: args.selected_devices = options.get('selected_devices', None) @@ -202,6 +232,23 @@ def _get_subprocess_env_list(nprocs, options): raise ValueError("The selected xpu card %s cannot found in " "XPU_VISIBLE_DEVICES (%s)." 
% (card_id, ",".join(env_devices_list))) + elif options['backend'] == 'gloo': + # TODO check gpu / xpu flag must not exist + warnings.warn( + "Your model will be trained under CPUONLY mode by using GLOO," + "because CPUPlace is specified manually or your installed PaddlePaddle only support CPU Device." + ) + args.paddle_cpuonly = True + args.selected_devices = None + args.ips = args.cluster_node_ips + assert options.get( + 'use_paddlecloud', + None) is None, "CPUONLY spawn doesn't support use paddle cloud" + assert len( + args.cluster_node_ips.split(',') + ) <= 1, "CPUONLY spawn only support single trainer, that is len(ips)=1, but got %s." + assert _get_trainers_num( + ) == 1, "CPUONLY spawn doesn't support multi-trainer" # set other inner args args.node_ip = options.get('node_ip', None) @@ -215,11 +262,17 @@ def _get_subprocess_env_list(nprocs, options): args.use_paddlecloud = use_paddlecloud() # get cluster and pod config - cluster, pod = get_cluster_and_pod(args) + if options['backend'] == 'gloo': + devices_per_proc = [x for x in range(0, nprocs)] + cluster, pod = get_cluster_from_args(args, DeviceMode.CPU, + devices_per_proc) + else: + cluster, pod = get_cluster_and_pod(args) # prepare subprocess env list for trainer in pod.trainers: - processes_env_list.append(_prepare_trainer_env(cluster, trainer)) + processes_env_list.append( + _prepare_trainer_env(cluster, trainer, options['backend'])) # [Debug] print config args.print_config = options.get('print_config', False) @@ -236,27 +289,35 @@ def _remove_risky_env(): os.environ.pop("https_proxy", None) -def _set_trainer_env(env_dict): +def _set_trainer_env(env_dict, backend): # NOTE(chenweihang): [ Why need set FLAGS_selected_gpus or FLAGS_selected_xpus here? ] # When the child process starts, it will inherit the configuration of the # main process and set the FLAGS once, but the environment variable has # not been set at this time, which leads to the FLAGS_selected_gpus or FLAGS_selected_xpus # is keep same with mainprocess(usually empty), so manually update the flags here - if core.is_compiled_with_cuda(): + + # NOTE(xiongkun): why put backend here? because if gloo, we shouldn't set FLAGS_selectedXXX + # + + if backend == 'nccl': set_flags({'FLAGS_selected_gpus': env_dict['FLAGS_selected_gpus']}) - elif core.is_compiled_with_xpu(): + elif backend == 'bkcl': set_flags({'FLAGS_selected_xpus': env_dict['FLAGS_selected_xpus']}) else: - raise ValueError("PaddlePaddle should be compiled with XPU or CUDA.") + #NOTE(xiongkun) why not raise Error ? + # So far, we added support for CPU parallel, and will be applied when paddle is not + # compiled with cuda or xp. just do nothing. 
+ pass + for var_name in env_dict: os.environ[var_name] = env_dict[var_name] -def _func_wrapper(func, args, error_queue, return_queue, env_dict): +def _func_wrapper(func, args, error_queue, return_queue, env_dict, backend): try: # config subprocess environment variables _remove_risky_env() - _set_trainer_env(env_dict) + _set_trainer_env(env_dict, backend) # execute function result = func(*args) # record function return value @@ -487,7 +548,8 @@ def train(print_result=False): return_queue = mp.SimpleQueue() process = mp.Process( target=_func_wrapper, - args=(func, args, error_queue, return_queue, procs_env_list[i])) + args=(func, args, error_queue, return_queue, procs_env_list[i], + options['backend'])) process.daemon = daemon process.start() error_queues.append(error_queue) diff --git a/python/paddle/distributed/utils.py b/python/paddle/distributed/utils.py index 31d5748ce392e..1c27a0018fc02 100644 --- a/python/paddle/distributed/utils.py +++ b/python/paddle/distributed/utils.py @@ -25,6 +25,7 @@ from contextlib import closing import socket from paddle.fluid import core +from paddle.distributed.fleet.launch_utils import get_backend_by_compile_flag from distutils.util import strtobool from paddle.fluid.layer_helper import LayerHelper @@ -613,8 +614,10 @@ def __free_port(): return None -def _prepare_trainer_env(cluster, trainer): - if core.is_compiled_with_xpu(): +def _prepare_trainer_env(cluster, trainer, backend=None): + if backend is None: + backend = get_backend_by_compile_flag() # for compatibility + if backend == 'bkcl': proc_env = { "FLAGS_selected_xpus": "%s" % ",".join([str(g) for g in trainer.gpus]), @@ -623,7 +626,7 @@ def _prepare_trainer_env(cluster, trainer): "PADDLE_TRAINERS_NUM": "%d" % cluster.trainers_nranks(), "PADDLE_TRAINER_ENDPOINTS": ",".join(cluster.trainers_endpoints()) } - elif core.is_compiled_with_cuda(): + elif backend == 'nccl': proc_env = { "FLAGS_selected_gpus": "%s" % ",".join([str(g) for g in trainer.gpus]), @@ -632,6 +635,19 @@ def _prepare_trainer_env(cluster, trainer): "PADDLE_TRAINERS_NUM": "%d" % cluster.trainers_nranks(), "PADDLE_TRAINER_ENDPOINTS": ",".join(cluster.trainers_endpoints()) } + elif backend == 'gloo': + # NOTE (xiongkun) default fall back into cpu only + proc_env = { + "PADDLE_TRAINER_ID": "%d" % trainer.rank, + "PADDLE_CURRENT_ENDPOINT": "%s" % trainer.endpoint, + "PADDLE_TRAINERS_NUM": "%d" % cluster.trainers_nranks(), + "PADDLE_TRAINER_ENDPOINTS": ",".join(cluster.trainers_endpoints()), + "PADDLE_DISTRI_BACKEND": + backend, # only add here, other will be auto + } + else: + raise ValueError("backend must be one of 'gloo, nccl, bkcl'") + return proc_env diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index ac7471f8edfa4..1c9ce2bef5e17 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -200,8 +200,14 @@ endif() list(REMOVE_ITEM TEST_OPS test_parallel_dygraph_hybrid_parallel) +LIST(REMOVE_ITEM TEST_OPS test_parallel_dygraph_transformer_gloo) # NOTE: @xiongkun03, cpu is too slow, fix it in next PR + if (NOT WITH_GLOO) LIST(REMOVE_ITEM TEST_OPS test_parallel_dygraph_dataparallel_cpuonly) + + LIST(REMOVE_ITEM TEST_OPS test_parallel_dygraph_unused_variables_gloo) + LIST(REMOVE_ITEM TEST_OPS test_parallel_dygraph_sparse_embedding_over_height_gloo) + LIST(REMOVE_ITEM TEST_OPS test_parallel_dygraph_sparse_embedding_gloo) endif() if ((NOT WITH_GPU) AND (NOT WITH_ROCM)) @@ -491,6 +497,10 @@ if (APPLE OR 
WIN32) list(REMOVE_ITEM TEST_OPS test_multiprocess_dataloader_dataset) endif() +if (NOT WITH_GLOO) + LIST(REMOVE_ITEM TEST_OPS test_cpuonly_spawn) +endif() + if(NOT WITH_GPU OR WIN32 OR APPLE) list(REMOVE_ITEM TEST_OPS test_build_strategy_fusion_group_pass) endif() @@ -654,6 +664,9 @@ if(WITH_DISTRIBUTE) endforeach(TEST_OP) # solve it later. bash_test_modules(test_fleet_launch_ps START_BASH test_fleet_launch_ps.sh SERIAL LABELS "RUN_TYPE=EXCLUSIVE" ENVS "PADDLE_DIST_UT_PORT=${dist_ut_port}" PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR} ) + if (WITH_GLOO) + bash_test_modules(test_cpuonly_launch START_BASH test_cpuonly_launch.sh SERIAL LABELS "RUN_TYPE=EXCLUSIVE" ENVS "PADDLE_DIST_UT_PORT=${dist_ut_port}" PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR} ) + endif() bash_test_modules(test_new_group START_BASH test_new_group.sh SERIAL LABELS "RUN_TYPE=EXCLUSIVE" ENVS "PADDLE_DIST_UT_PORT=${dist_ut_port}+20" PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR} ) endif(NOT APPLE) endif() @@ -1070,3 +1083,8 @@ set_tests_properties(test_inplace_addto_strategy PROPERTIES TIMEOUT 120) set_tests_properties(test_eigvals_op PROPERTIES TIMEOUT 400) set_tests_properties(test_tensordot PROPERTIES TIMEOUT 1000) set_tests_properties(test_tensordot PROPERTIES LABELS "RUN_TYPE=NIGHTLY") +if (WITH_GLOO) + set_tests_properties(test_parallel_dygraph_unused_variables_gloo PROPERTIES TIMEOUT 120) + set_tests_properties(test_parallel_dygraph_sparse_embedding_gloo PROPERTIES TIMEOUT 120) + set_tests_properties(test_parallel_dygraph_sparse_embedding_over_height_gloo PROPERTIES TIMEOUT 120) +endif() diff --git a/python/paddle/fluid/tests/unittests/parallel_dygraph_gradient_check.py b/python/paddle/fluid/tests/unittests/parallel_dygraph_gradient_check.py index 048c9b399d804..781d606f33b8f 100644 --- a/python/paddle/fluid/tests/unittests/parallel_dygraph_gradient_check.py +++ b/python/paddle/fluid/tests/unittests/parallel_dygraph_gradient_check.py @@ -66,8 +66,7 @@ def forward(self, x): class TestDistTraning(unittest.TestCase): def test_multiple_gpus(self): - backend = os.environ.get('PADDLE_DISTRI_BACKEND', 'auto') - dist.init_parallel_env(backend) + dist.init_parallel_env() self.trainer_id = dist.get_rank() model_a = SimpleNet(self.trainer_id) diff --git a/python/paddle/fluid/tests/unittests/parallel_dygraph_se_resnext.py b/python/paddle/fluid/tests/unittests/parallel_dygraph_se_resnext.py index 4ce67676c3e85..0387de32c9145 100644 --- a/python/paddle/fluid/tests/unittests/parallel_dygraph_se_resnext.py +++ b/python/paddle/fluid/tests/unittests/parallel_dygraph_se_resnext.py @@ -324,6 +324,7 @@ def run_one_loop(self, model, opt, data): bs = len(data) dy_x_data = np.array([x[0].reshape(3, 224, 224) for x in data]).astype('float32') + dy_x_data = dy_x_data / 255.0 y_data = np.array([x[1] for x in data]).astype('int64').reshape(bs, 1) img = to_variable(dy_x_data) label = to_variable(y_data) diff --git a/python/paddle/fluid/tests/unittests/test_cpuonly_launch.sh b/python/paddle/fluid/tests/unittests/test_cpuonly_launch.sh new file mode 100644 index 0000000000000..1c35166cf4434 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_cpuonly_launch.sh @@ -0,0 +1,42 @@ +#!/bin/bash + +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +function test_launch_cpuonly(){ + python -m paddle.distributed.launch --nproc_per_node=4 --backend=gloo \ + parallel_dygraph_gradient_check.py 2>ut.elog + if grep -q "ABORT" ut.elog; then + echo "test cpu only failed" + exit -1 + else + if grep -q "CPUONLY" ut.elog; then + echo "test_launch_cpuonly successfully" + else + echo "test_launch_cpuonly failed" + exit -1 + fi + fi +} +function test_launch_error_case1(){ + python -m paddle.distributed.launch --nproc_per_node=4 --backend=random_str \ + parallel_dygraph_gradient_check.py 2>ut.elog + if grep -q "ValueError" ut.elog; then + echo "test_launch_error_case1 successfully" + else + exit -1 + fi +} + +test_launch_cpuonly +test_launch_error_case1 diff --git a/python/paddle/fluid/tests/unittests/test_cpuonly_spawn.py b/python/paddle/fluid/tests/unittests/test_cpuonly_spawn.py new file mode 100644 index 0000000000000..1def2ffd82ad7 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_cpuonly_spawn.py @@ -0,0 +1,72 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function +import unittest + +import paddle +import paddle.nn as nn +import paddle.optimizer as opt +import paddle.distributed as dist + + +class LinearNet(nn.Layer): + def __init__(self): + super(LinearNet, self).__init__() + self._linear1 = nn.Linear(10, 10) + self._linear2 = nn.Linear(10, 1) + + def forward(self, x): + return self._linear2(self._linear1(x)) + + +def train(print_result=False): + # 1. initialize parallel environment + dist.init_parallel_env() + + # 2. create data parallel layer & optimizer + layer = LinearNet() + dp_layer = paddle.DataParallel(layer) + + loss_fn = nn.MSELoss() + adam = opt.Adam(learning_rate=0.001, parameters=dp_layer.parameters()) + + # 3. 
run layer + inputs = paddle.randn([10, 10], 'float32') + outputs = dp_layer(inputs) + labels = paddle.randn([10, 1], 'float32') + loss = loss_fn(outputs, labels) + + if print_result is True: + print("loss:", loss.numpy()) + + loss.backward() + print("Grad is", layer._linear1.weight.grad) + adam.step() + adam.clear_grad() + + +class TestSpawn(unittest.TestCase): + def test_spawn(self): + dist.spawn(train, backend='gloo', nprocs=4) + + def test_wrong_backend(self): + try: + dist.spawn(train, backend='something', nprocs=4) + except ValueError as e: + self.assertEqual(type(e), ValueError) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_dist_base.py b/python/paddle/fluid/tests/unittests/test_dist_base.py index eceb484a0184c..63985415c51f6 100755 --- a/python/paddle/fluid/tests/unittests/test_dist_base.py +++ b/python/paddle/fluid/tests/unittests/test_dist_base.py @@ -209,7 +209,11 @@ def run_use_fleet_api_20_trainer(self, args): def get_data(): origin_batch = next(reader_generator) - if args.update_method != "local" and args.use_reader_alloc: + if paddle.distributed.get_world_size( + ) == 1 and args.update_method == 'gloo': # Gloo single mode + return origin_batch + + elif args.update_method != "local" and args.use_reader_alloc: new_batch = [] for offset, item in enumerate(origin_batch): if offset % 2 == args.trainer_id: @@ -506,7 +510,10 @@ def run_one_loop(self, model, opt, data): "train_one_loop should be implemented by the child classes.") def _get_data(self, batch, args): - if args.update_method != "local": + if paddle.distributed.get_world_size( + ) == 1 and args.update_method == 'gloo': # Gloo single mode + return batch + elif args.update_method != "local": new_batch = [] for offset, item in enumerate(batch): if offset % 2 == args.trainer_id: @@ -518,14 +525,16 @@ def _get_data(self, batch, args): def run_trainer(self, args): seed = 90 - if fluid.core.is_compiled_with_cuda(): + if args.update_method == 'gloo': + place = fluid.CPUPlace() + elif fluid.core.is_compiled_with_cuda(): device_id = int(os.getenv("FLAGS_selected_gpus", "0")) place = fluid.CUDAPlace(device_id) elif fluid.core.is_compiled_with_xpu(): device_id = int(os.getenv("FLAGS_selected_xpus", "0")) place = fluid.XPUPlace(device_id) else: - assert ("Only support CUDAPlace or XPUPlace for now.") + assert ("Only support CUDAPlace or XPUPlace or CPU(Gloo) for now.") with fluid.dygraph.guard(place): fluid.default_startup_program().random_seed = seed @@ -554,6 +563,16 @@ def run_trainer(self, args): model = dygraph.parallel.DataParallel( model, strategy, find_unused_parameters=True) print_to_err(type(self).__name__, "model built in dygraph") + + elif args.update_method == "gloo": + paddle.distributed.init_parallel_env() + if not args.find_unused_parameters: + model = dygraph.parallel.DataParallel( + model, find_unused_parameters=False) + else: + model = dygraph.parallel.DataParallel( + model, find_unused_parameters=True) + out_losses = [] print_to_err(type(self).__name__, "begin to run dygraph training") for step_id, data in enumerate(train_reader()): @@ -588,12 +607,12 @@ def run_trainer_with_spawn(self, args): args.trainer_id = paddle.distributed.get_rank() # 3. init parallel env - if args.update_method == "nccl2": + if args.update_method in ["nccl2", "gloo"]: paddle.distributed.init_parallel_env() # 4. 
train model model, train_reader, opt = self.get_model() - if args.update_method == "nccl2": + if args.update_method in ["nccl2", "gloo"]: if args.find_unused_parameters: model = paddle.DataParallel(model, find_unused_parameters=True) else: @@ -668,7 +687,9 @@ def runtime_main(test_class): '--update_method', type=str, default="local", - choices=["pserver", "nccl2", "bkcl", "local", "nccl2_reduce_layer"]) + choices=[ + "pserver", "nccl2", "bkcl", "local", "nccl2_reduce_layer", "gloo" + ]) parser.add_argument('--trainer_id', type=int, required=False, default=0) parser.add_argument('--trainers', type=int, required=False, default=1) parser.add_argument('--nccl_comm_num', type=int, required=False, default=1) @@ -685,6 +706,7 @@ def runtime_main(test_class): '--current_endpoint', type=str, required=False, default="") parser.add_argument('--sync_mode', action='store_true') parser.add_argument('--use_cuda', action='store_true') + parser.add_argument('--use_cpu', action='store_true') parser.add_argument('--use_xpu', action='store_true') parser.add_argument('--use_dgc', action='store_true') parser.add_argument('--accumulate_gradient', action='store_true') @@ -713,6 +735,9 @@ def runtime_main(test_class): args = parser.parse_args() + if args.update_method == 'gloo': + paddle.set_device("cpu") + model = test_class() if args.role == "pserver" and args.update_method == "pserver": model.run_pserver(args) @@ -770,6 +795,7 @@ def setUp(self): self._use_reader_alloc = True self._nccl2_mode = False self._bkcl_mode = False + self._gloo_mode = False # now, support gloo backend self._pipeline_mode = False self._mp_mode = False # FIXME(typhoonzero): I added this stupid argument to enable @@ -875,7 +901,7 @@ def _run_local(self, batch_size=DEFAULT_BATCH_SIZE, batch_merge_repeat=1, log_name="", - devices="0"): + devices="1"): cmd = self._python_interp @@ -947,6 +973,21 @@ def _run_local(self, return pickle.loads(local_out) + def _run_local_gloo(self, + model, + envs, + check_error_log=False, + batch_size=DEFAULT_BATCH_SIZE, + batch_merge_repeat=1, + log_name="", + devices="0"): + saved_endpoints = self._ps_endpoints + self._ps_endpoints = self._ps_endpoints.split(',')[0] + result = self._run_cluster_gloo(model, envs, 'gloo', check_error_log, + log_name) + self._ps_endpoints = saved_endpoints + return result + def _run_cluster(self, model, envs, check_error_log, log_name): # Run dist train to compare with local results ps0, ps1, ps0_pipe, ps1_pipe = self.start_pserver( @@ -1037,6 +1078,62 @@ def _run_cluster(self, model, envs, check_error_log, log_name): return pickle.loads(tr0_out), pickle.loads(tr1_out) + def _get_gloo_trainer_cmd(self, model, ep, update_method, trainer_id, + trainer_num): + env = {} + tr_cmd = "%s -u" + + if os.getenv('WITH_COVERAGE', 'OFF') == 'ON': + tr_cmd += " -m coverage run --branch -p" + + tr_cmd += " %s --role trainer --endpoints %s --trainer_id %d --current_endpoint %s --update_method %s --lr %f" + + tr_cmd = tr_cmd % \ + (self._python_interp, model, self._ps_endpoints, + trainer_id, ep, update_method, self._lr) + + if self._use_reduce: + tr_cmd += " --use_reduce" + if self._use_reader_alloc: + tr_cmd += " --use_reader_alloc" + #assert self._use_reduce == False, "gloo not support _use_reduce" + #assert self._use_reader_alloc == False, "gloo not support _use_reduce" + if self._save_model: + tr_cmd += " --save_model" + self.__use_cuda = False + self.__use_xpu = False + assert self.__use_cuda == False, "gloo not support use cuda" + assert self.__use_xpu == False, "gloo not support use xpu" 
+ tr_cmd += " --use_cpu" + env.update({ + "PADDLE_TRAINERS_NUM": "{}".format(trainer_num), + "PADDLE_TRAINER_ID": "{}".format(trainer_id), + "PADDLE_TRAINER_ENDPOINTS": self._ps_endpoints, + "PADDLE_CURRENT_ENDPOINT": ep, + "PADDLE_CURRENT_ENDPOINT": ep, + "PADDLE_DISTRI_BACKEND": "gloo", + "GLOG_v": "2", + }) + + assert self._use_dgc == False, "gloo not support use dgc" + if self._accumulate_gradient: + tr_cmd += " --accumulate_gradient" + + if self._find_unused_parameters: + tr_cmd += " --find_unused_parameters" + + assert self._pipeline_mode == False, "gloo not support use pipeline" + + if self._enable_backward_deps: # build strategy, save it + tr_cmd += " --enable_backward_deps" + + if self._fuse_all_reduce is not None: + tr_cmd += " --fuse_all_reduce {}".format(self._fuse_all_reduce) + + assert self._use_fleet_api == False, "gloo not support use fleet api" + assert self._use_fleet_api_20 == False, "gloo not support use fleet api" + return tr_cmd, env + def _get_nccl2_trainer_cmd(self, model, ep, update_method, trainer_id, trainer_num): env = {} @@ -1123,6 +1220,57 @@ def _get_nccl2_trainer_cmd(self, model, ep, update_method, trainer_id, return tr_cmd, env + def _run_cluster_gloo(self, model, envs, update_method, check_error_log, + log_name): + assert update_method == "gloo", "_run_cluster_gloo must have update_method: gloo, but get %s" % update_method + assert not self._use_hallreduce, "_run_cluster_gloo must have _use_hallreduce = false" + + worker_endpoints = self._ps_endpoints.split(",") + + trainer_num = len(worker_endpoints) + + procs = [] + pipes = [] + for i in range(0, trainer_num): + tr_cmd, tr_env = self._get_gloo_trainer_cmd( + model, worker_endpoints[i], update_method, i, trainer_num) + tr_env.update(envs) + tr_env["GLOG_vmodule"] = 'gloo_context=4' + tr_env["GLOG_v"] = '3' + print("use_hallreduce:{} tr_cmd:{}, env: {}".format( + self._use_hallreduce, tr_cmd, tr_env)) + + tr_pipe = open(log_name + "_tr{}_err.log".format(i), "wb") + + print_to_err( + type(self).__name__, + "going to start process {} with nccl2".format(i)) + tr_proc = subprocess.Popen( + tr_cmd.strip().split(" "), + stdout=subprocess.PIPE, + stderr=tr_pipe, + env=tr_env) + + procs.append(tr_proc) + pipes.append(tr_pipe) + + outs = [] + for i in range(0, trainer_num): + tr_out, tr_err = procs[i].communicate() + outs.append(tr_out) + pipes[i].close() + sys.stderr.write('trainer {} stderr: {}\n'.format(i, tr_err)) + + if trainer_num == 1: + if check_error_log: print("outs[0]:", outs[0]) + return pickle.loads(outs[0]) + + else: + if check_error_log: + print("outs[0]:", outs[0]) + print("outs[1]:", outs[1]) + return pickle.loads(outs[0]), pickle.loads(outs[1]) + def _run_cluster_nccl2(self, model, envs, update_method, check_error_log, log_name): if self._use_hallreduce: @@ -1262,7 +1410,12 @@ def check_with_place(self, required_envs = self._get_required_envs(check_error_log, need_envs) - local_losses \ + if self._gloo_mode: + local_losses \ + = self._run_local_gloo(model_file, required_envs, + check_error_log, log_name=log_name) + else: + local_losses \ = self._run_local(model_file, required_envs, check_error_log, log_name=log_name) @@ -1288,6 +1441,14 @@ def check_with_place(self, update_method='bkcl', check_error_log=check_error_log, log_name=log_name) + elif self._gloo_mode: + # gloo mode, cpu only parallel train @xiongkun03 + tr0_losses, tr1_losses = self._run_cluster_gloo( + model_file, + required_envs, + update_method='gloo', + check_error_log=check_error_log, + log_name=log_name) elif self._pipeline_mode: 
tr0_losses, tr1_losses = self._run_pipeline( diff --git a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_dataparallel.py b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_dataparallel.py index c97cd56e8a7a4..edf9aed04f5e0 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_dataparallel.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_dataparallel.py @@ -49,6 +49,51 @@ def get_gpus(selected_gpus): return selected_gpus +def start_local_trainers_cpu(trainer_endpoints, + training_script, + training_script_args, + log_dir=None): + current_env = copy.copy(os.environ.copy()) + current_env.pop("http_proxy", None) + current_env.pop("https_proxy", None) + + procs = [] + n_rank = len(trainer_endpoints) + print(trainer_endpoints) + for rank_id, endpoint in enumerate(trainer_endpoints): + proc_env = { + "PADDLE_DISTRI_BACKEND": "gloo", + "PADDLE_TRAINER_ID": "%d" % rank_id, + "PADDLE_CURRENT_ENDPOINT": "%s" % endpoint, + "PADDLE_TRAINERS_NUM": "%d" % n_rank, + "PADDLE_TRAINER_ENDPOINTS": ",".join(trainer_endpoints) + } + + current_env.update(proc_env) + + print("trainer proc env:{}".format(current_env)) + + assert os.getenv('WITH_COVERAGE', + 'OFF') == 'OFF', "Gloo don't support WITH_COVERAGE." + cmd = "python -u " + training_script + + print("start trainer proc:{} env:{}".format(cmd, proc_env)) + + fn = None + + proc = subprocess.Popen(cmd.split(" "), env=current_env) + + tp = TrainerProc() + tp.proc = proc + tp.rank = rank_id + tp.log_fn = fn + tp.cmd = cmd + + procs.append(tp) + + return procs + + def start_local_trainers(cluster, pod, training_script, @@ -116,6 +161,26 @@ def run_mnist_2gpu(self, target_file_name): training_script=target_file_name, training_script_args=[]) + while True: + alive = watch_local_trainers(procs, cluster.trainers_endpoints()) + + if not alive: + print("Local procs complete, POD info:{}".format(pod)) + break + time.sleep(3) + + +class TestMultipleWithGloo(unittest.TestCase): + def run_mnist_2cpu(self, target_file_name): + + cluster, pod = get_cluster_from_args( + [0, 1]) #tmp use. for getting trainer_nranks() + + procs = start_local_trainers_cpu( + cluster.trainers_endpoints(), + training_script=target_file_name, + training_script_args=[]) + while True: alive = watch_local_trainers(procs, cluster.trainers_nranks()) diff --git a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_sparse_embedding_gloo.py b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_sparse_embedding_gloo.py new file mode 100644 index 0000000000000..56fcf806c4717 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_sparse_embedding_gloo.py @@ -0,0 +1,59 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import os +import sys +import unittest + +import paddle.fluid as fluid +from test_dist_base import TestDistBase +from spawn_runner_base import TestDistSpawnRunner +from parallel_dygraph_sparse_embedding import TestSparseEmbedding +from parallel_dygraph_sparse_embedding_fp64 import TestSparseEmbeddingFP64 + +flag_name = os.path.splitext(__file__)[0] + + +class TestParallelDygraphSparseEmdedding_GLOO(TestDistBase): + def _setup_config(self): + self._sync_mode = False + self._gloo_mode = True + self._dygraph = True + + def test_sparse_embedding(self): + self.check_with_place( + "parallel_dygraph_sparse_embedding.py", + delta=1e-5, + check_error_log=True, + log_name=flag_name) + + +class TestParallelDygraphSparseEmdeddingFP64_GLOO(TestDistBase): + def _setup_config(self): + self._sync_mode = False + self._gloo_mode = True + self._dygraph = True + + def test_sparse_embedding_fp64(self): + self.check_with_place( + "parallel_dygraph_sparse_embedding_fp64.py", + delta=1e-5, + check_error_log=True, + log_name=flag_name) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_sparse_embedding_over_height_gloo.py b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_sparse_embedding_over_height_gloo.py new file mode 100644 index 0000000000000..ba43e26e23a4e --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_sparse_embedding_over_height_gloo.py @@ -0,0 +1,44 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import os +import sys +import unittest + +import paddle.fluid as fluid +from test_dist_base import TestDistBase +from spawn_runner_base import TestDistSpawnRunner +from parallel_dygraph_sparse_embedding_over_height import TestSparseEmbeddingOverHeight + +flag_name = os.path.splitext(__file__)[0] + + +class TestParallelDygraphSparseEmdeddingOverHeight_GLOO(TestDistBase): + def _setup_config(self): + self._sync_mode = False + self._gloo_mode = True + self._dygraph = True + + def test_sparse_embedding(self): + self.check_with_place( + "parallel_dygraph_sparse_embedding_over_height.py", + delta=1e-7, + check_error_log=True, + log_name=flag_name) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_transformer_gloo.py b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_transformer_gloo.py new file mode 100644 index 0000000000000..d3619cc1b9a00 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_transformer_gloo.py @@ -0,0 +1,61 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import os +import sys +import unittest + +import paddle.fluid as fluid +from test_dist_base import TestDistBase +from spawn_runner_base import TestDistSpawnRunner +from parallel_dygraph_transformer import TestTransformer + +flag_name = os.path.splitext(__file__)[0] + + +class TestParallelDygraphTransformer_GLOO(TestDistBase): + def _setup_config(self): + self._sync_mode = False + self._gloo_mode = True + self._dygraph = True + + def test_transformer(self): + self.check_with_place( + "parallel_dygraph_transformer.py", + delta=1e-5, + check_error_log=True, + log_name=flag_name) + + +class TestParallelDygraphTransformerAccGrad_GLOO(TestDistBase): + def _setup_config(self): + self._sync_mode = False + self._gloo_mode = True + self._dygraph = True + self._accumulate_gradient = True + self._find_unused_parameters = False + + def test_transformer(self): + if fluid.core.is_compiled_with_cuda(): + self.check_with_place( + "parallel_dygraph_transformer.py", + delta=1e-5, + check_error_log=True, + log_name=flag_name) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_unused_variables_gloo.py b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_unused_variables_gloo.py new file mode 100644 index 0000000000000..89373fcb6eebc --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_unused_variables_gloo.py @@ -0,0 +1,72 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import os +import sys +import unittest + +import paddle.fluid as fluid +from test_dist_base import TestDistBase +from spawn_runner_base import TestDistSpawnRunner +from parallel_dygraph_unused_variables import TestSparseEmbeddingUnusedVars + +flag_name = os.path.splitext(__file__)[0] + + +class TestParallelDygraphUnusedVar_GLOO(TestDistBase): + def _setup_config(self): + self._sync_mode = False + self._gloo_mode = True + self._dygraph = True + + def test_net(self): + self.check_with_place( + "parallel_dygraph_unused_variables.py", + delta=1e-5, + check_error_log=True, + log_name=flag_name) + + +class TestParallelDygraphNoVar_GLOO(TestDistBase): + def _setup_config(self): + self._sync_mode = False + self._gloo_mode = True + self._dygraph = True + + def test_net(self): + self.check_with_place( + "parallel_dygraph_none_var.py", + delta=1e-5, + check_error_log=True, + log_name=flag_name) + + +class TestParallelDygraphSharedUnusedVariables_GLOO(TestDistBase): + def _setup_config(self): + self._sync_mode = False + self._gloo_mode = True + self._dygraph = True + + def test_mnist(self): + self.check_with_place( + "parallel_dygraph_shared_unused_var.py", + delta=1e-5, + check_error_log=True, + log_name=flag_name) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_spawn_and_init_parallel_env.py b/python/paddle/fluid/tests/unittests/test_spawn_and_init_parallel_env.py index 14547eca5aca2..dccc117f6bc15 100644 --- a/python/paddle/fluid/tests/unittests/test_spawn_and_init_parallel_env.py +++ b/python/paddle/fluid/tests/unittests/test_spawn_and_init_parallel_env.py @@ -24,6 +24,7 @@ from paddle.fluid import core from paddle.fluid.dygraph import parallel_helper +import multiprocessing # NOTE(chenweihang): Coverage CI is currently not able to count python3 # unittest, so the unittests here covers some cases that will only be @@ -89,8 +90,8 @@ def test_options_valid_check(self): def test_get_default_nprocs(self): paddle.set_device('cpu') - with self.assertRaises(RuntimeError): - nprocs = _get_default_nprocs() + nprocs = _get_default_nprocs() + self.assertEqual(nprocs, multiprocessing.cpu_count()) paddle.set_device('gpu') nprocs = _get_default_nprocs() From 7bf2aa3883066cb880e4bca8f8691dcdaf470c51 Mon Sep 17 00:00:00 2001 From: TTerror Date: Thu, 21 Oct 2021 14:28:24 +0800 Subject: [PATCH 060/116] add fill_any_like/flatten ops to train ssd on kunlun (#36550) * add some ops to train ssd on kunlun * update test_fill_any_like_op_xpu.py --- .../fluid/operators/fill_any_like_op_xpu.cc | 79 +++++ paddle/fluid/operators/flatten_op_xpu.cc | 67 ++++ paddle/fluid/platform/xpu/xpu2_op_list.h | 36 ++ .../fluid/tests/unittests/op_test_xpu.py | 24 +- .../xpu/test_fill_any_like_op_xpu.py | 77 +++++ .../unittests/xpu/test_flatten2_op_xpu.py | 83 +++++ .../test_flatten_contiguous_range_op_xpu.py | 320 ++++++++++++++++++ .../unittests/xpu/test_flatten_op_xpu.py | 77 +++++ 8 files changed, 761 insertions(+), 2 deletions(-) create mode 100644 paddle/fluid/operators/fill_any_like_op_xpu.cc create mode 100644 paddle/fluid/operators/flatten_op_xpu.cc create mode 100644 python/paddle/fluid/tests/unittests/xpu/test_fill_any_like_op_xpu.py create mode 100644 python/paddle/fluid/tests/unittests/xpu/test_flatten2_op_xpu.py create mode 100644 python/paddle/fluid/tests/unittests/xpu/test_flatten_contiguous_range_op_xpu.py create mode 100644 python/paddle/fluid/tests/unittests/xpu/test_flatten_op_xpu.py diff --git 
a/paddle/fluid/operators/fill_any_like_op_xpu.cc b/paddle/fluid/operators/fill_any_like_op_xpu.cc
new file mode 100644
index 0000000000000..76cf339fbf5cc
--- /dev/null
+++ b/paddle/fluid/operators/fill_any_like_op_xpu.cc
@@ -0,0 +1,79 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef PADDLE_WITH_XPU
+
+#include "paddle/fluid/operators/fill_any_like_op.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename T>
+class FillAnyLikeXPUKernel : public framework::OpKernel<T> {
+ public:
+  using CommonType = typename std::common_type<
+      float,
+      typename std::conditional<std::is_same<T, platform::float16>::value,
+                                float, T>::type>::type;
+  using XPUInTDType = typename XPUTypeTrait<T>::Type;
+
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* out = context.Output<framework::Tensor>("Out");
+    out->mutable_data<T>(context.GetPlace());
+
+    float value = context.Attr<float>("value");
+
+    auto common_type_value = static_cast<CommonType>(value);
+
+    PADDLE_ENFORCE_EQ(
+        (common_type_value >=
+         static_cast<CommonType>(std::numeric_limits<T>::lowest())) &&
+            (common_type_value <=
+             static_cast<CommonType>(std::numeric_limits<T>::max())),
+        true,
+        platform::errors::InvalidArgument(
+            "The filled value is out of range for target type, "
+            "current kernel type is %s, the range should between %f "
+            "and %f, but now value is %f.",
+            typeid(T).name(),
+            static_cast<CommonType>(std::numeric_limits<T>::lowest()),
+            static_cast<CommonType>(std::numeric_limits<T>::max()), value));
+
+    PADDLE_ENFORCE_EQ(
+        std::isnan(value), false,
+        platform::errors::InvalidArgument("The filled value is NaN."));
+
+    auto& dev_ctx =
+        context.template device_context<platform::XPUDeviceContext>();
+    auto out_data = reinterpret_cast<XPUInTDType*>(out->data<T>());
+    int ret = xpu::constant(dev_ctx.x_context(), out_data, out->numel(),
+                            static_cast<XPUInTDType>(value));
+    PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
+                      platform::errors::External(
+                          "XPU CONSTANT API return wrong value[%d %s].", ret,
+                          XPUAPIErrorMsg[ret]));
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+REGISTER_OP_XPU_KERNEL(fill_any_like, ops::FillAnyLikeXPUKernel<int>,
+                       ops::FillAnyLikeXPUKernel<int64_t>,
+                       ops::FillAnyLikeXPUKernel<float>,
+                       ops::FillAnyLikeXPUKernel<paddle::platform::float16>);
+
+#endif
diff --git a/paddle/fluid/operators/flatten_op_xpu.cc b/paddle/fluid/operators/flatten_op_xpu.cc
new file mode 100644
index 0000000000000..53c0c688fd9e9
--- /dev/null
+++ b/paddle/fluid/operators/flatten_op_xpu.cc
@@ -0,0 +1,67 @@
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
*/ + +#ifdef PADDLE_WITH_XPU + +#include "paddle/fluid/operators/flatten_op.h" + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_XPU_KERNEL( + flatten, ops::FlattenKernel, + ops::FlattenKernel, + ops::FlattenKernel, + ops::FlattenKernel); +REGISTER_OP_XPU_KERNEL( + flatten_grad, + ops::FlattenGradKernel, + ops::FlattenGradKernel, + ops::FlattenGradKernel, + ops::FlattenGradKernel); +REGISTER_OP_XPU_KERNEL( + flatten2, ops::Flatten2Kernel, + ops::Flatten2Kernel, + ops::Flatten2Kernel, + ops::Flatten2Kernel); +REGISTER_OP_XPU_KERNEL( + flatten2_grad, + ops::Flatten2GradKernel, + ops::Flatten2GradKernel, + ops::Flatten2GradKernel, + ops::Flatten2GradKernel); +REGISTER_OP_XPU_KERNEL( + flatten_contiguous_range, + ops::FlattenContiguousRangeKernel, + ops::FlattenContiguousRangeKernel, + ops::FlattenContiguousRangeKernel, + ops::FlattenContiguousRangeKernel, + ops::FlattenContiguousRangeKernel); +REGISTER_OP_XPU_KERNEL( + flatten_contiguous_range_grad, + ops::FlattenContiguousRangeGradKernel, + ops::FlattenContiguousRangeGradKernel, + ops::FlattenContiguousRangeGradKernel, + ops::FlattenContiguousRangeGradKernel, + ops::FlattenContiguousRangeGradKernel); +#endif diff --git a/paddle/fluid/platform/xpu/xpu2_op_list.h b/paddle/fluid/platform/xpu/xpu2_op_list.h index 5d45e5d9d5050..0a9a9453b53e3 100644 --- a/paddle/fluid/platform/xpu/xpu2_op_list.h +++ b/paddle/fluid/platform/xpu/xpu2_op_list.h @@ -119,6 +119,42 @@ XPUOpMap& get_kl2_ops() { {"slice_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), pOpKernelType(vartype::FP16, XPUPlace()), pOpKernelType(vartype::INT32, XPUPlace())})}, + {"fill_any_like", + XPUKernelSet({pOpKernelType(vartype::INT64, XPUPlace()), + pOpKernelType(vartype::INT32, XPUPlace()), + pOpKernelType(vartype::FP16, XPUPlace()), + pOpKernelType(vartype::FP32, XPUPlace())})}, + {"flatten", XPUKernelSet({pOpKernelType(vartype::INT64, XPUPlace()), + pOpKernelType(vartype::INT32, XPUPlace()), + pOpKernelType(vartype::INT8, XPUPlace()), + pOpKernelType(vartype::FP32, XPUPlace())})}, + {"flatten_grad", + XPUKernelSet({pOpKernelType(vartype::INT64, XPUPlace()), + pOpKernelType(vartype::INT32, XPUPlace()), + pOpKernelType(vartype::INT8, XPUPlace()), + pOpKernelType(vartype::FP32, XPUPlace())})}, + {"flatten2", XPUKernelSet({pOpKernelType(vartype::INT64, XPUPlace()), + pOpKernelType(vartype::INT32, XPUPlace()), + pOpKernelType(vartype::INT8, XPUPlace()), + pOpKernelType(vartype::FP32, XPUPlace())})}, + {"flatten2_grad", + XPUKernelSet({pOpKernelType(vartype::INT64, XPUPlace()), + pOpKernelType(vartype::INT32, XPUPlace()), + pOpKernelType(vartype::INT8, XPUPlace()), + pOpKernelType(vartype::FP32, XPUPlace())})}, + + {"flatten_contiguous_range", + XPUKernelSet({pOpKernelType(vartype::INT64, XPUPlace()), + pOpKernelType(vartype::INT32, XPUPlace()), + pOpKernelType(vartype::INT8, XPUPlace()), + pOpKernelType(vartype::FP16, XPUPlace()), + pOpKernelType(vartype::FP32, XPUPlace())})}, + {"flatten_contiguous_range_grad", + XPUKernelSet({pOpKernelType(vartype::INT64, XPUPlace()), + pOpKernelType(vartype::INT32, XPUPlace()), + pOpKernelType(vartype::INT8, XPUPlace()), + pOpKernelType(vartype::FP16, XPUPlace()), + pOpKernelType(vartype::FP32, XPUPlace())})}, // AddMore }; diff --git a/python/paddle/fluid/tests/unittests/op_test_xpu.py b/python/paddle/fluid/tests/unittests/op_test_xpu.py index 133367a5f3625..239708cc17449 100644 --- a/python/paddle/fluid/tests/unittests/op_test_xpu.py +++ b/python/paddle/fluid/tests/unittests/op_test_xpu.py @@ 
-91,11 +91,31 @@ def is_mkldnn_op_test(): # case in NO_FP64_CHECK_GRAD_CASES and op in NO_FP64_CHECK_GRAD_OP_LIST should be fixed if not hasattr(cls, "no_need_check_grad") \ and not is_empty_grad_op(cls.op_type): - if cls.dtype is not None and \ - cls.dtype != np.float32: + if cls.dtype is None or \ + (cls.dtype == np.float16 \ + and cls.op_type not in op_accuracy_white_list.NO_FP16_CHECK_GRAD_OP_LIST \ + and not hasattr(cls, "exist_check_grad")): raise AssertionError("This test of %s op needs check_grad." % cls.op_type) + # check for op test with fp64 precision, but not check mkldnn op test for now + if cls.dtype in [np.float32, np.float64] \ + and cls.op_type not in op_accuracy_white_list.NO_FP64_CHECK_GRAD_OP_LIST \ + and not hasattr(cls, 'exist_fp64_check_grad') \ + and not is_xpu_op_test() \ + and not is_mkldnn_op_test() \ + and not is_rocm_op_test() \ + and not is_npu_op_test(): + raise AssertionError( + "This test of %s op needs check_grad with fp64 precision." % + cls.op_type) + + if not cls.input_shape_is_large \ + and cls.op_type not in check_shape_white_list.NEED_TO_FIX_OP_LIST: + raise AssertionError( + "Input's shape should be large than or equal to 100 for " + + cls.op_type + " Op.") + def try_call_once(self, data_type): if not self.call_once: self.call_once = True diff --git a/python/paddle/fluid/tests/unittests/xpu/test_fill_any_like_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_fill_any_like_op_xpu.py new file mode 100644 index 0000000000000..27c101b20f684 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/xpu/test_fill_any_like_op_xpu.py @@ -0,0 +1,77 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import sys +sys.path.append("..") + +import paddle +import paddle.fluid as fluid +import paddle.fluid.core as core +from paddle.fluid import Program, program_guard +import paddle.compat as cpt +import unittest +import numpy as np +from op_test import OpTest +from op_test_xpu import XPUOpTest + +paddle.enable_static() + + +class TestFillAnyLikeOp(OpTest): + def setUp(self): + self.op_type = "fill_any_like" + self.dtype = np.float32 + self.use_xpu = True + self.use_mkldnn = False + self.value = 0.0 + self.init() + self.inputs = {'X': np.random.random((219, 232)).astype(self.dtype)} + self.attrs = {'value': self.value, 'use_xpu': True} + self.outputs = {'Out': self.value * np.ones_like(self.inputs["X"])} + + def init(self): + pass + + def test_check_output(self): + if paddle.is_compiled_with_xpu(): + place = paddle.XPUPlace(0) + self.check_output_with_place(place) + + +class TestFillAnyLikeOpFloat32(TestFillAnyLikeOp): + def init(self): + self.dtype = np.float32 + self.value = 0.0 + + +class TestFillAnyLikeOpValue1(TestFillAnyLikeOp): + def init(self): + self.value = 1.0 + + +class TestFillAnyLikeOpValue2(TestFillAnyLikeOp): + def init(self): + self.value = 1e-9 + + +class TestFillAnyLikeOpFloat16(TestFillAnyLikeOp): + def init(self): + self.dtype = np.float16 + self.value = 0.05 + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_flatten2_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_flatten2_op_xpu.py new file mode 100644 index 0000000000000..9cbc83950d1e8 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/xpu/test_flatten2_op_xpu.py @@ -0,0 +1,83 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import unittest +import sys +sys.path.append("..") +import numpy as np +import paddle +import paddle.fluid as fluid +from op_test import OpTest +from op_test_xpu import XPUOpTest +paddle.enable_static() + + +class TestFlatten2Op(XPUOpTest): + def setUp(self): + self.set_xpu() + self.op_type = "flatten2" + self.place = paddle.XPUPlace(0) + self.init_test_case() + self.inputs = {"X": np.random.random(self.in_shape).astype("float32")} + self.init_attrs() + self.outputs = { + "Out": self.inputs["X"].reshape(self.new_shape), + "XShape": np.random.random(self.in_shape).astype("float32") + } + + def set_xpu(self): + self.__class__.use_xpu = True + + def test_check_output(self): + self.check_output_with_place(self.place, no_check_set=["XShape"]) + + def test_check_grad(self): + self.check_grad_with_place(self.place, ["X"], "Out") + + def init_test_case(self): + self.in_shape = (3, 2, 4, 5) + self.axis = 1 + self.new_shape = (3, 40) + + def init_attrs(self): + self.attrs = {"axis": self.axis} + + +class TestFlatten2OpWithCornerAxis(TestFlatten2Op): + def init_test_case(self): + self.in_shape = (3, 2, 5, 4) + self.axis = 0 + self.new_shape = (1, 120) + + +class TestFlatten2OpWithDefaultAxis(TestFlatten2Op): + def init_test_case(self): + self.in_shape = (10, 2, 2, 3) + self.new_shape = (10, 12) + + def init_attrs(self): + self.attrs = {} + + +class TestFlatten2OpSixDims(TestFlatten2Op): + def init_test_case(self): + self.in_shape = (3, 2, 3, 2, 4, 4) + self.axis = 4 + self.new_shape = (36, 16) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_flatten_contiguous_range_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_flatten_contiguous_range_op_xpu.py new file mode 100644 index 0000000000000..dcad3c479f446 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/xpu/test_flatten_contiguous_range_op_xpu.py @@ -0,0 +1,320 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import sys +sys.path.append("..") + +import numpy as np +import unittest +import sys +sys.path.append("..") +from op_test import OpTest +from op_test_xpu import XPUOpTest +import paddle +import paddle.fluid as fluid + +paddle.enable_static() + + +class TestFlattenOp(XPUOpTest): + def setUp(self): + self.set_xpu() + self.op_type = "flatten_contiguous_range" + self.place = paddle.XPUPlace(0) + self.use_xpu = True + self.use_mkldnn = False + + self.start_axis = 0 + self.stop_axis = -1 + self.dtype = np.float32 + self.init_test_case() + self.inputs = {"X": np.random.random(self.in_shape).astype(self.dtype)} + self.init_attrs() + self.outputs = { + "Out": self.inputs["X"].reshape(self.new_shape), + "XShape": np.random.random(self.in_shape).astype("float32") + } + + def set_xpu(self): + self.__class__.use_xpu = True + + def test_check_output(self): + self.check_output_with_place(self.place, no_check_set=["XShape"]) + + def test_check_grad(self): + self.check_grad_with_place(self.place, ["X"], "Out") + + def init_test_case(self): + self.in_shape = (3, 2, 5, 4) + self.start_axis = 0 + self.stop_axis = -1 + self.new_shape = (120) + + def init_attrs(self): + self.attrs = { + "start_axis": self.start_axis, + "stop_axis": self.stop_axis, + 'use_xpu': True, + } + + +class TestFlattenOp_1(TestFlattenOp): + def init_test_case(self): + self.in_shape = (3, 2, 5, 4) + self.start_axis = 1 + self.stop_axis = 2 + self.new_shape = (3, 10, 4) + + def init_attrs(self): + self.attrs = { + "start_axis": self.start_axis, + "stop_axis": self.stop_axis + } + + +class TestFlattenOp_2(TestFlattenOp): + def init_test_case(self): + self.in_shape = (3, 2, 5, 4) + self.start_axis = 0 + self.stop_axis = 1 + self.new_shape = (6, 5, 4) + + def init_attrs(self): + self.attrs = { + "start_axis": self.start_axis, + "stop_axis": self.stop_axis + } + + +class TestFlattenOp_3(TestFlattenOp): + def init_test_case(self): + self.in_shape = (3, 2, 5, 4) + self.start_axis = 0 + self.stop_axis = 2 + self.new_shape = (30, 4) + + def init_attrs(self): + self.attrs = { + "start_axis": self.start_axis, + "stop_axis": self.stop_axis + } + + +class TestFlattenOp_4(TestFlattenOp): + def init_test_case(self): + self.in_shape = (3, 2, 5, 4) + self.start_axis = -2 + self.stop_axis = -1 + self.new_shape = (3, 2, 20) + + def init_attrs(self): + self.attrs = { + "start_axis": self.start_axis, + "stop_axis": self.stop_axis + } + + +class TestFlattenOp_5(TestFlattenOp): + def init_test_case(self): + self.in_shape = (3, 2, 5, 4) + self.start_axis = 2 + self.stop_axis = 2 + self.new_shape = (3, 2, 5, 4) + + def init_attrs(self): + self.attrs = { + "start_axis": self.start_axis, + "stop_axis": self.stop_axis + } + + +class TestFlattenOpSixDims(TestFlattenOp): + def init_test_case(self): + self.in_shape = (3, 2, 3, 2, 4, 4) + self.start_axis = 3 + self.stop_axis = 5 + self.new_shape = (3, 2, 3, 32) + + def init_attrs(self): + self.attrs = { + "start_axis": self.start_axis, + "stop_axis": self.stop_axis + } + + +class TestFlattenOp_Float32(TestFlattenOp): + def init_test_case(self): + self.in_shape = (3, 2, 5, 4) + self.start_axis = 0 + self.stop_axis = 1 + self.new_shape = (6, 5, 4) + self.dtype = np.float32 + + def init_attrs(self): + self.attrs = { + "start_axis": self.start_axis, + "stop_axis": self.stop_axis + } + + +class TestFlattenOp_int32(TestFlattenOp): + def init_test_case(self): + self.in_shape = (3, 2, 5, 4) + self.start_axis = 0 + self.stop_axis = 1 + self.new_shape = (6, 5, 4) + self.dtype = np.int32 
+ + def init_attrs(self): + self.attrs = { + "start_axis": self.start_axis, + "stop_axis": self.stop_axis, + 'use_xpu': True + } + + def test_check_grad(self): + pass + + +class TestFlattenOp_int8(TestFlattenOp): + def init_test_case(self): + self.in_shape = (3, 2, 5, 4) + self.start_axis = 0 + self.stop_axis = 1 + self.new_shape = (6, 5, 4) + self.dtype = np.int8 + + def init_attrs(self): + self.attrs = { + "start_axis": self.start_axis, + "stop_axis": self.stop_axis + } + + def test_check_grad(self): + pass + + +class TestFlattenOp_int64(TestFlattenOp): + def init_test_case(self): + self.in_shape = (3, 2, 5, 4) + self.start_axis = 0 + self.stop_axis = 1 + self.new_shape = (6, 5, 4) + self.dtype = np.int64 + + def init_attrs(self): + self.attrs = { + "start_axis": self.start_axis, + "stop_axis": self.stop_axis + } + + def test_check_grad(self): + pass + + +class TestFlatten2OpError(unittest.TestCase): + def test_errors(self): + image_shape = (2, 3, 4, 4) + x = np.arange(image_shape[0] * image_shape[1] * image_shape[2] * + image_shape[3]).reshape(image_shape) / 100. + x = x.astype('float32') + + def test_ValueError1(): + x_var = paddle.static.data( + name="x", shape=image_shape, dtype='float32') + out = paddle.flatten(x_var, start_axis=2, stop_axis=1) + + self.assertRaises(ValueError, test_ValueError1) + + def test_ValueError2(): + x_var = paddle.static.data( + name="x", shape=image_shape, dtype='float32') + paddle.flatten(x_var, start_axis=10, stop_axis=1) + + self.assertRaises(ValueError, test_ValueError2) + + def test_ValueError3(): + x_var = paddle.static.data( + name="x", shape=image_shape, dtype='float32') + paddle.flatten(x_var, start_axis=2, stop_axis=10) + + self.assertRaises(ValueError, test_ValueError3) + + def test_type(): + # dtype must be float32, float64, int8, int32, int64 + x2 = np.arange(image_shape[0] * image_shape[1] * image_shape[2] * + image_shape[3]).reshape(image_shape) / 100. + x2 = x2.astype('float16') + x2_var = paddle.fluid.data( + name='x2', shape=[3, 2, 4, 5], dtype='float16') + paddle.flatten(x2_var) + + self.assertRaises(TypeError, test_type) + + def test_InputError(): + out = paddle.flatten(x) + + self.assertRaises(ValueError, test_InputError) + + +class TestStaticFlattenPythonAPI(unittest.TestCase): + def execute_api(self, x, start_axis=0, stop_axis=-1): + return paddle.flatten(x, start_axis, stop_axis) + + def test_static_api(self): + paddle.enable_static() + np_x = np.random.rand(2, 3, 4, 4).astype('float32') + + main_prog = paddle.static.Program() + with paddle.static.program_guard(main_prog, paddle.static.Program()): + x = paddle.static.data( + name="x", shape=[2, 3, 4, 4], dtype='float32') + out = self.execute_api(x, start_axis=-2, stop_axis=-1) + + exe = paddle.static.Executor(place=paddle.XPUPlace(0)) + fetch_out = exe.run(main_prog, feed={"x": np_x}, fetch_list=[out]) + self.assertTrue((2, 3, 16) == fetch_out[0].shape) + + +class TestStaticInplaceFlattenPythonAPI(TestStaticFlattenPythonAPI): + def execute_api(self, x, start_axis=0, stop_axis=-1): + return x.flatten_(start_axis, stop_axis) + + +class TestFlattenPython(unittest.TestCase): + def test_python_api(self): + image_shape = (2, 3, 4, 4) + x = np.arange(image_shape[0] * image_shape[1] * image_shape[2] * + image_shape[3]).reshape(image_shape) / 100. 
+ x = x.astype('float32') + + def test_InputError(): + out = paddle.flatten(x) + + self.assertRaises(ValueError, test_InputError) + + def test_Negative(): + paddle.disable_static(paddle.XPUPlace(0)) + img = paddle.to_tensor(x) + out = paddle.flatten(img, start_axis=-2, stop_axis=-1) + return out.numpy().shape + + res_shape = test_Negative() + self.assertTrue((2, 3, 16) == res_shape) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_flatten_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_flatten_op_xpu.py new file mode 100644 index 0000000000000..ed435198353ca --- /dev/null +++ b/python/paddle/fluid/tests/unittests/xpu/test_flatten_op_xpu.py @@ -0,0 +1,77 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import sys +sys.path.append("..") +import numpy as np +import paddle +import paddle.fluid as fluid +from op_test import OpTest +from op_test_xpu import XPUOpTest +paddle.enable_static() + + +class TestFlattenOp(XPUOpTest): + def setUp(self): + self.op_type = "flatten" + self.use_xpu = True + self.place = paddle.XPUPlace(0) + self.init_test_case() + self.inputs = {"X": np.random.random(self.in_shape).astype("float32")} + self.init_attrs() + self.outputs = {"Out": self.inputs["X"].reshape(self.new_shape)} + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad(self): + self.check_grad_with_place(self.place, ["X"], "Out") + + def init_test_case(self): + self.in_shape = (3, 2, 2, 10) + self.axis = 1 + self.new_shape = (3, 40) + + def init_attrs(self): + self.attrs = {"axis": self.axis} + + +class TestFlattenOp1(TestFlattenOp): + def init_test_case(self): + self.in_shape = (3, 2, 2, 10) + self.axis = 0 + self.new_shape = (1, 120) + + +class TestFlattenOpWithDefaultAxis(TestFlattenOp): + def init_test_case(self): + self.in_shape = (10, 2, 2, 3) + self.new_shape = (10, 12) + + def init_attrs(self): + self.attrs = {} + + +class TestFlattenOpSixDims(TestFlattenOp): + def init_test_case(self): + self.in_shape = (3, 2, 3, 2, 4, 4) + self.axis = 4 + self.new_shape = (36, 16) + + +if __name__ == "__main__": + unittest.main() From 66f4b29220b1417ba65f25d9636eba84d280cc13 Mon Sep 17 00:00:00 2001 From: danleifeng <52735331+danleifeng@users.noreply.github.com> Date: Thu, 21 Oct 2021 15:23:17 +0800 Subject: [PATCH 061/116] fix hdfs download_dir (#36590) --- python/paddle/distributed/fleet/utils/fs.py | 4 ++-- python/paddle/fluid/tests/unittests/hdfs_test_utils.py | 2 +- python/paddle/fluid/tests/unittests/test_hdfs3.py | 1 + 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/python/paddle/distributed/fleet/utils/fs.py b/python/paddle/distributed/fleet/utils/fs.py index f56580f8ca2fe..8895a529526f7 100644 --- a/python/paddle/distributed/fleet/utils/fs.py +++ b/python/paddle/distributed/fleet/utils/fs.py @@ -842,8 +842,8 @@ def __subprocess_download(local_path, datas): 
if self.is_file(fs_path): return self._try_download(fs_path, local_path) # download dir - _, all_files = self.ls_dir(fs_path) - + _, all_filenames = self.ls_dir(fs_path) + all_files = [fs_path + i for i in all_filenames] procs = [] for i in range(multi_processes): process_datas = self._split_files(all_files, i, multi_processes) diff --git a/python/paddle/fluid/tests/unittests/hdfs_test_utils.py b/python/paddle/fluid/tests/unittests/hdfs_test_utils.py index 6b49049073948..69ccc7088b834 100644 --- a/python/paddle/fluid/tests/unittests/hdfs_test_utils.py +++ b/python/paddle/fluid/tests/unittests/hdfs_test_utils.py @@ -195,7 +195,7 @@ def _test_download_dir(self, fs): fs.download(src_file, dst_file) local = LocalFS() - self.assertTrue(local.is_exist(dst_file)) + self.assertTrue(local.is_exist(file1)) local.delete(dst_file) fs.delete(src_file) diff --git a/python/paddle/fluid/tests/unittests/test_hdfs3.py b/python/paddle/fluid/tests/unittests/test_hdfs3.py index d214768b2e32f..57b0b1ba45f24 100644 --- a/python/paddle/fluid/tests/unittests/test_hdfs3.py +++ b/python/paddle/fluid/tests/unittests/test_hdfs3.py @@ -40,6 +40,7 @@ def test_hdfs(self): self._test_upload(fs) self._test_upload_dir(fs) self._test_download(fs) + self._test_download_dir(fs) def test_local(self): fs = LocalFS() From 6072aecba10908241f8883a005d2fc12c2a24352 Mon Sep 17 00:00:00 2001 From: Jack Zhou Date: Thu, 21 Oct 2021 16:05:53 +0800 Subject: [PATCH 062/116] Add viterbi decode (#35778) * add viterbi decode cpu kernel * add viterbi decoder api in paddle.text * add a data buffer once to avoid create many small pieces of data buffer frequently * fix viterbi max_seq_length bug * fix seq_len=1 bug * fix device context * move split out of for loop * remove INVERSE_SUB * remove 2 GET_CAST_MASK * remove 1 loop * remove Functor * add to_static deploy code * use MAX_FUNC instead of ELE_MAX * add MaxFunctor * impl max_func * remove MaxFunctor * remove cast op * use REGISTER_OP_WITHOUT_GRADIENT * add viterbi cuda kernel * add FIX_BLOCKDIM_CASE macro * add MKL add, mul; add get data mask * add arange mkl impl * add CPU Argmax * add cpu gather * use EXECUTE_MKL_ELEMENT_BINARY_OP instead of some ADD, MUL * use SameDimsBinaryOP instead of EXECUTE_MKL_ELEMENT_BINARY_OP * use SAME_DIMS_ELEMENT_BINARY_OP * add SimpleBroadcastBinaryOP * use int instead of int64_t to accelerate * optimize SimpleBroadcastBinaryOP * optimize SimpleBroadcastBinaryOP * optimize performance in both single thread and multithread situation * remove useless line * remove useless code * add CREATE_TENSOR_BUFFER macro * add INIT_REQUIRED_TENSOR macro * add comment * fix windows ci * add viterbi unittest * remove cuda add functor * remove cuda equal * remove a template function * fix windows ci * fix windows dtype * remove some template instance * remove useless header file * remove some blockdim * remove transpose impl * accelerate cpu performance on single thread situation * viterbi_decode->crf_decode * rename crf params name * add viterbi api test * remove useless import * add enable_static * use viterbi decoder * fix viterbi len=1 * fix viterbi unittest * remove useless comments * reconstruct viterbi decode * remove ADD,SUB,MUL structure * fix coverage * remove CREATE_TENSOR * add name args * crf.py->ops.py; with_start_stop_tag->include_start_end_tag * update crf_decode en docs * fix viterbi decode en docs * fix some review comments * add FIXED_BLOCK_DIM_CASE in cuda * push_back->emplace_back * crf_decode->viterbi_decode; include_start_end_tag->include_bos_eos_tag * 
paddle.text.ops.viterbi_decode->paddle.text.viterbi_decode * fix viterbi_decode en docs --- .../elementwise/elementwise_op_function.h | 4 +- paddle/fluid/operators/viterbi_decode_op.cc | 109 +++++ paddle/fluid/operators/viterbi_decode_op.cu | 200 +++++++++ paddle/fluid/operators/viterbi_decode_op.h | 415 ++++++++++++++++++ .../tests/unittests/test_viterbi_decode_op.py | 134 ++++++ python/paddle/text/__init__.py | 6 +- python/paddle/text/viterbi_decode.py | 132 ++++++ 7 files changed, 996 insertions(+), 4 deletions(-) create mode 100644 paddle/fluid/operators/viterbi_decode_op.cc create mode 100644 paddle/fluid/operators/viterbi_decode_op.cu create mode 100644 paddle/fluid/operators/viterbi_decode_op.h create mode 100644 python/paddle/fluid/tests/unittests/test_viterbi_decode_op.py create mode 100644 python/paddle/text/viterbi_decode.py diff --git a/paddle/fluid/operators/elementwise/elementwise_op_function.h b/paddle/fluid/operators/elementwise/elementwise_op_function.h index 312978a010b30..2df7dd06f2cc8 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op_function.h +++ b/paddle/fluid/operators/elementwise/elementwise_op_function.h @@ -240,7 +240,7 @@ inline void GetBroadcastDimsArrays(const framework::DDim &x_dims, x_dims, y_dims, x_dims_array[i], y_dims_array[i], i)); if ((x_dims_array[i] > 1 || y_dims_array[i] > 1) || (x_dims_array[i] == 1 && y_dims_array[i] == 1)) { - out_dims_array[i] = std::max(x_dims_array[i], y_dims_array[i]); + out_dims_array[i] = (std::max)(x_dims_array[i], y_dims_array[i]); } else { out_dims_array[i] = -1; } @@ -1779,7 +1779,7 @@ void CommonElementwiseBroadcastForward( const framework::Tensor *y, framework::Tensor *z, const framework::DDim &x_dims, const framework::DDim &y_dims, Functor func, int axis, const bool is_xsize_larger = true) { - int max_dim = std::max(x_dims.size(), y_dims.size()); + int max_dim = (std::max)(x_dims.size(), y_dims.size()); axis = (axis == -1 ? std::abs(x_dims.size() - y_dims.size()) : axis); PADDLE_ENFORCE_GE( axis, 0, diff --git a/paddle/fluid/operators/viterbi_decode_op.cc b/paddle/fluid/operators/viterbi_decode_op.cc new file mode 100644 index 0000000000000..bf1cdeed65a84 --- /dev/null +++ b/paddle/fluid/operators/viterbi_decode_op.cc @@ -0,0 +1,109 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at +http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/viterbi_decode_op.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +class ViterbiDecodeOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("Input"), "Input", "Input", "ViterbiDecode"); + OP_INOUT_CHECK(ctx->HasInput("Transition"), "Input", "Transition", + "ViterbiDecode"); + OP_INOUT_CHECK(ctx->HasInput("Length"), "Input", "Length", "ViterbiDecode"); + OP_INOUT_CHECK(ctx->HasOutput("Scores"), "Output", "Scores", + "ViterbiDecode"); + OP_INOUT_CHECK(ctx->HasOutput("Path"), "Output", "Path", "ViterbiDecode"); + auto in_dims = ctx->GetInputDim("Input"); + PADDLE_ENFORCE_EQ(in_dims.size(), 3, + platform::errors::InvalidArgument( + "The rank of Input in ViterbiDecode must be 3. But " + "received Input's rank is %d.", + in_dims.size())); + auto length_dims = ctx->GetInputDim("Length"); + PADDLE_ENFORCE_EQ(length_dims.size(), 1, + platform::errors::InvalidArgument( + "The rank of Length in ViterbiDecode must be 1. But " + "received Length's rank is %d.", + length_dims.size())); + auto transition_dims = ctx->GetInputDim("Transition"); + PADDLE_ENFORCE_EQ( + transition_dims.size(), 2, + platform::errors::InvalidArgument( + "The rank of Transition in ViterbiDecode must be 2. But " + "received Transition's rank is %d.", + transition_dims.size())); + if (ctx->IsRuntime()) { + PADDLE_ENFORCE_EQ( + in_dims[0], length_dims[0], + platform::errors::InvalidArgument( + "The batch size of Input and Length should be equal.")); + PADDLE_ENFORCE_EQ(in_dims[2], transition_dims[0], + platform::errors::InvalidArgument( + "The number of tags of Input (%d) and Transition " + "(%d) should be equal.", + transition_dims[0], in_dims[2])); + } + ctx->SetOutputDim("Scores", length_dims); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + OperatorWithKernel::IndicateVarDataType(ctx, "Input"), + ctx.device_context()); + } +}; + +class ViterbiDecodeOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput( + "Input", + "The unary emission tensor. The shape of Input must be (batch_size," + "sequence_length, num_tags). "); + AddInput("Transition", + "The transition matrix. The shape of Transition must be ( " + "num_tags, num_tags). "); + AddInput("Length", + "The input length tensor storing real length of each sequence for " + "correctness. The shape of Length MUST be (batch_size)."); + AddOutput("Scores", + "The scores tensor containing the score for the Viterbi " + "sequence. The shape of Scores MUST be (batch_size)."); + AddOutput("Path", + "The paths tensor containing the highest scoring tag indices. 
" + "The shape of Scores MUST be (batch_size, sequence_length)."); + AddAttr("include_bos_eos_tag", + "If set to True, the last row and the last column of " + "transitions will be considered as start tag.") + .SetDefault(true); + AddComment(R"DOC( + )DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace platform = paddle::platform; +REGISTER_OP_WITHOUT_GRADIENT(viterbi_decode, ops::ViterbiDecodeOp, + ops::ViterbiDecodeOpMaker); +REGISTER_OP_CPU_KERNEL( + viterbi_decode, ops::ViterbiDecodeKernel, + ops::ViterbiDecodeKernel); diff --git a/paddle/fluid/operators/viterbi_decode_op.cu b/paddle/fluid/operators/viterbi_decode_op.cu new file mode 100644 index 0000000000000..086ff05b08461 --- /dev/null +++ b/paddle/fluid/operators/viterbi_decode_op.cu @@ -0,0 +1,200 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at +http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/elementwise/elementwise_functor.h" +#include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h" +#include "paddle/fluid/operators/gather.cu.h" +#include "paddle/fluid/operators/viterbi_decode_op.h" + +#ifdef __NVCC__ +#include "cub/cub.cuh" +#endif +#ifdef __HIPCC__ +#include +namespace cub = hipcub; +#endif + +namespace paddle { +namespace operators { + +#define FIXED_BLOCK_DIM_CASE_BASE(log2_block_dim, ...) \ + case (1 << (log2_block_dim)): { \ + constexpr auto kBlockDim = (1 << (log2_block_dim)); \ + __VA_ARGS__; \ + } break + +#define FIXED_BLOCK_DIM_CASE(...) \ + FIXED_BLOCK_DIM_CASE_BASE(10, ##__VA_ARGS__); \ + FIXED_BLOCK_DIM_CASE_BASE(9, ##__VA_ARGS__); \ + FIXED_BLOCK_DIM_CASE_BASE(8, ##__VA_ARGS__); \ + FIXED_BLOCK_DIM_CASE_BASE(7, ##__VA_ARGS__); \ + FIXED_BLOCK_DIM_CASE_BASE(6, ##__VA_ARGS__); \ + FIXED_BLOCK_DIM_CASE_BASE(5, ##__VA_ARGS__); \ + FIXED_BLOCK_DIM_CASE_BASE(4, ##__VA_ARGS__); \ + FIXED_BLOCK_DIM_CASE_BASE(3, ##__VA_ARGS__); + +int64_t ComputeBlockSize(int64_t col) { + if (col > 512) + return 1024; + else if (col > 256) + return 512; + else if (col > 128) + return 256; + else if (col > 64) + return 128; + else if (col > 32) + return 64; + else if (col > 16) + return 32; + else if (col > 8) + return 16; + else + return 8; +} + +template