From eb28a36dea3998afababbd76803c7e2f8de7ea0b Mon Sep 17 00:00:00 2001 From: wuhuachaocoding <77733235+wuhuachaocoding@users.noreply.github.com> Date: Fri, 24 Sep 2021 14:41:59 +0800 Subject: [PATCH 1/7] concat api support empty tensor. (#35845) --- python/paddle/fluid/layers/tensor.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/python/paddle/fluid/layers/tensor.py b/python/paddle/fluid/layers/tensor.py index 3a0bbeb0fa341..abc84be12b38f 100644 --- a/python/paddle/fluid/layers/tensor.py +++ b/python/paddle/fluid/layers/tensor.py @@ -322,6 +322,8 @@ def concat(input, axis=0, name=None): if isinstance(axis, Variable): axis = axis.numpy() axis = axis.item(0) + if not isinstance(input, Variable): + input = [t for t in input if t.shape.count(0) == 0] return _C_ops.concat(input, 'axis', axis) check_type(input, 'input', (list, tuple, Variable), 'concat') From 4f42e5d77c8b1083f73389cc555ff59c288c0535 Mon Sep 17 00:00:00 2001 From: Kaipeng Deng Date: Fri, 24 Sep 2021 15:36:10 +0800 Subject: [PATCH 2/7] fix undefined var in test_batch_sampler. test=develop (#35924) --- python/paddle/fluid/tests/unittests/test_batch_sampler.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/python/paddle/fluid/tests/unittests/test_batch_sampler.py b/python/paddle/fluid/tests/unittests/test_batch_sampler.py index 4c323a2511f5b..4c5338314afb1 100644 --- a/python/paddle/fluid/tests/unittests/test_batch_sampler.py +++ b/python/paddle/fluid/tests/unittests/test_batch_sampler.py @@ -22,6 +22,8 @@ RandomSampler, WeightedRandomSampler from paddle.io import DistributedBatchSampler +IMAGE_SIZE = 32 + class RandomDataset(Dataset): def __init__(self, sample_num, class_num): @@ -31,7 +33,7 @@ def __init__(self, sample_num, class_num): def __getitem__(self, idx): np.random.seed(idx) image = np.random.random([IMAGE_SIZE]).astype('float32') - label = np.random.randint(0, CLASS_NUM - 1, (1, )).astype('int64') + label = np.random.randint(0, self.class_num - 1, (1, )).astype('int64') return image, label def __len__(self): From 485b387d0db2b7f4592a2b13f9c68acba5fe7a60 Mon Sep 17 00:00:00 2001 From: Jacek Czaja Date: Fri, 24 Sep 2021 09:47:36 +0200 Subject: [PATCH 3/7] [oneDNN] candidate fix to #34554 (#35884) * - candidate fix * - More fixes to #34554 * - another incosnstent fix to key * - Remvoed unneeded line * - matching the cache behaviour to other ops --- .../fluid/operators/mkldnn/conv_mkldnn_op.cc | 21 +++---- paddle/fluid/platform/mkldnn_reuse.h | 56 ++++++++----------- 2 files changed, 32 insertions(+), 45 deletions(-) diff --git a/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc index 09386fc31ee31..1b69dd7ea00c7 100644 --- a/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc @@ -706,7 +706,6 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { platform::CreateKey(dev_ctx, src_tz, src_dt, ctx.InputName("Input") + ctx.InputName("Filter")); - const std::string key_conv_pd = key + "@conv_pd"; bool need_s8_to_u8 = false; std::shared_ptr conv_p; std::shared_ptr src_memory_p; @@ -721,6 +720,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { // are merged/unified, this will disappear auto key_tid = platform::ExtendKeyWithThreadInfoIfNeeded(dev_ctx, key); + const std::string key_conv_pd = key_tid + "@conv_pd"; auto prim_key = key_tid + "@conv_p"; auto dst_key = key_tid + "@dst_mem_p"; auto src_key = key_tid + "@src_mem_p"; @@ -731,12 +731,13 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { auto src_reorder_key = key_tid + "@src_mem_preorder_p"; auto residual_reorder_key = key_tid + "@residual_data_mem_preorder_p"; - conv_p = std::static_pointer_cast( - dev_ctx.GetBlob(prim_key)); + conv_pd = + std::static_pointer_cast( + dev_ctx.GetBlob(key_conv_pd)); auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); - if (conv_p == nullptr || !is_test) { + if (conv_pd == nullptr || !is_test) { float fuse_alpha = ctx.Attr("fuse_alpha"); float fuse_beta = ctx.Attr("fuse_beta"); bool force_fp32_output = ctx.Attr("force_fp32_output"); @@ -946,7 +947,6 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { } // create convolution op primitive - auto scale_bias_key = key + "@scale_bias"; conv_p = handler->AcquireConvolution(); if (bias) { const K* bias_data = bias->data(); @@ -1000,13 +1000,10 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { dev_ctx.GetBlob(weights_key)); dst_memory_p = std::static_pointer_cast(dev_ctx.GetBlob(dst_key)); - conv_pd = - std::static_pointer_cast( - dev_ctx.GetBlob(key_conv_pd)); - if (conv_pd) { - handler.reset(new platform::ConvMKLDNNHandler(conv_pd, dev_ctx, - mkldnn_engine, key)); - } + conv_p = std::static_pointer_cast( + dev_ctx.GetBlob(prim_key)); + handler.reset(new platform::ConvMKLDNNHandler(conv_pd, dev_ctx, + mkldnn_engine, key)); if (fuse_residual_conn) { auto residual_param = ctx.Input("ResidualData"); diff --git a/paddle/fluid/platform/mkldnn_reuse.h b/paddle/fluid/platform/mkldnn_reuse.h index 29a3f8e9dcd3c..d6ab9e50a066e 100644 --- a/paddle/fluid/platform/mkldnn_reuse.h +++ b/paddle/fluid/platform/mkldnn_reuse.h @@ -603,7 +603,6 @@ class MKLDNNHandler { const std::string& base_key) : dev_ctx_(dev_ctx), engine_(engine), - key_common_(base_key), key_(platform::ExtendKeyWithThreadInfoIfNeeded(dev_ctx, base_key)) { platform::MKLDNNDeviceContext::tls().log_lib_version(); } @@ -789,7 +788,6 @@ class MKLDNNHandler { protected: const MKLDNNDeviceContext& dev_ctx_; mkldnn::engine engine_; - std::string key_common_; std::string key_; }; @@ -1371,42 +1369,34 @@ class ConvMKLDNNTemplateHandler : public MKLDNNHandler { // Conv PD has to be passed to Grad op that // may be exxecuted by diffrent thread, hence // for that one we use key that does not contain TID - const std::string key_conv_pd = key_common_ + "@conv_pd"; + const std::string key_conv_pd = key_ + "@conv_pd"; conv_pd_ = std::static_pointer_cast( dev_ctx_.GetBlob(key_conv_pd)); if (conv_pd_ == nullptr) { - static std::mutex acquire_barrier; - std::lock_guard block_threads_until_finish_this_job( - acquire_barrier); - - conv_pd_ = std::static_pointer_cast( - dev_ctx_.GetBlob(key_conv_pd)); - if (conv_pd_ == nullptr) { - mkldnn::memory::dims stride_dims = strides; - mkldnn::memory::dims dilations_dims = dilations; - auto mkldnn_paddings = ToMkldnnPadding(paddings); - - auto conv_desc = - bias ? typename forward_t::desc( - fwd_prop_kind, convolutional_algorithm::T, - src, weights, *bias, dst, stride_dims, dilations_dims, - mkldnn_paddings[0], mkldnn_paddings[1]) - : typename forward_t::desc( - fwd_prop_kind, convolutional_algorithm::T, - src, weights, dst, stride_dims, dilations_dims, - mkldnn_paddings[0], mkldnn_paddings[1]); - - mkldnn::primitive_attr conv_attr = - CreatePostOps(fuse_activation, fuse_alpha, fuse_beta, - fuse_residual_conn, output_shift_scale, sum_scale); - - conv_pd_.reset(new typename forward_t::primitive_desc( - conv_desc, conv_attr, engine)); - // Save conv_pd/src_memory/weights_memory for backward pass - dev_ctx_.SetBlob(key_conv_pd, conv_pd_); - } + mkldnn::memory::dims stride_dims = strides; + mkldnn::memory::dims dilations_dims = dilations; + auto mkldnn_paddings = ToMkldnnPadding(paddings); + + auto conv_desc = + bias ? typename forward_t::desc( + fwd_prop_kind, convolutional_algorithm::T, src, + weights, *bias, dst, stride_dims, dilations_dims, + mkldnn_paddings[0], mkldnn_paddings[1]) + : typename forward_t::desc( + fwd_prop_kind, convolutional_algorithm::T, src, + weights, dst, stride_dims, dilations_dims, + mkldnn_paddings[0], mkldnn_paddings[1]); + + mkldnn::primitive_attr conv_attr = + CreatePostOps(fuse_activation, fuse_alpha, fuse_beta, + fuse_residual_conn, output_shift_scale, sum_scale); + + conv_pd_.reset( + new typename forward_t::primitive_desc(conv_desc, conv_attr, engine)); + // Save conv_pd/src_memory/weights_memory for backward pass + dev_ctx_.SetBlob(key_conv_pd, conv_pd_); } return conv_pd_; From 82f255d0a5ab93b8996897cb8d6a28484694c91a Mon Sep 17 00:00:00 2001 From: JingZhuangzhuang <75348594+JZZ-NOTE@users.noreply.github.com> Date: Fri, 24 Sep 2021 03:14:07 -0500 Subject: [PATCH 4/7] add pool2d convert test (#35923) * add pool2d convert test * modify error * modify error * modify error * modify error * modify error * modify error --- .../inference/tensorrt/convert/pool2d_op.cc | 13 + paddle/fluid/inference/tensorrt/op_teller.cc | 20 ++ .../test_trt_convert_anchor_generator.py | 116 +++++++++ .../test_trt_convert_conv2d_transpose.py | 227 ++++++++++++++++++ .../test_trt_convert_depthwise_conv2d.py | 203 ++++++++++++++++ ..._trt_convert_depthwise_conv2d_transpose.py | 191 +++++++++++++++ .../ir/inference/test_trt_convert_pool2d.py | 148 ++++++++++++ 7 files changed, 918 insertions(+) create mode 100644 python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_anchor_generator.py create mode 100644 python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_conv2d_transpose.py create mode 100644 python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_depthwise_conv2d.py create mode 100644 python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_depthwise_conv2d_transpose.py create mode 100644 python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_pool2d.py diff --git a/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc index 90d6392fd6404..1898f28c73ad0 100644 --- a/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc @@ -87,6 +87,10 @@ class Pool2dOpConverter : public OpConverter { bool adaptive = false; if (op_desc.HasAttr("adaptive")) adaptive = BOOST_GET_CONST(bool, op_desc.GetAttr("adaptive")); + std::string padding_algorithm = "EXPLICIT"; + if (op_desc.HasAttr("padding_algorithm")) + padding_algorithm = + BOOST_GET_CONST(std::string, op_desc.GetAttr("padding_algorithm")); nvinfer1::PoolingType nv_pool_type = nvinfer1::PoolingType::kMAX; nvinfer1::ReduceOperation reduce_operation = @@ -124,6 +128,9 @@ class Pool2dOpConverter : public OpConverter { pool_layer->setStride(nv_strides); pool_layer->setPadding(nv_paddings); pool_layer->setAverageCountExcludesPadding(exclusive); + if (padding_algorithm == "SAME") { + pool_layer->setPaddingMode(nvinfer1::PaddingMode::kSAME_UPPER); + } layer = pool_layer; } else if (global_pooling) { auto *reduce_layer = TRT_ENGINE_ADD_LAYER(engine_, Reduce, *input1, @@ -159,6 +166,9 @@ class Pool2dOpConverter : public OpConverter { auto output_name = op_desc.Output("Out")[0]; pool_layer->setStride(nv_strides); pool_layer->setPadding(nv_paddings); + if (padding_algorithm == "SAME") { + pool_layer->setPaddingMode(nvinfer1::PaddingMode::kSAME_UPPER); + } pool_layer->setAverageCountExcludesPadding(exclusive); pool_layer->setName(("pool2d (Output: " + output_name + ")").c_str()); pool_layer->getOutput(0)->setName(output_name.c_str()); @@ -198,6 +208,9 @@ class Pool2dOpConverter : public OpConverter { "trt pool layer in converter could not be created.")); pool_layer->setStride(nv_strides); pool_layer->setPadding(nv_paddings); + if (padding_algorithm == "SAME") { + pool_layer->setPaddingMode(nvinfer1::PaddingMode::kSAME_UPPER); + } pool_layer->setAverageCountExcludesPadding(exclusive); layer = pool_layer; } else { diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc index ea630a9c6db90..5bfd2f1277795 100644 --- a/paddle/fluid/inference/tensorrt/op_teller.cc +++ b/paddle/fluid/inference/tensorrt/op_teller.cc @@ -172,6 +172,22 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, std::vector paddings = BOOST_GET_CONST(std::vector, desc.GetAttr("paddings")); if (paddings.size() > 2) return false; + if (desc.HasAttr("exclusive")) { + if (BOOST_GET_CONST(bool, desc.GetAttr("exclusive"))) { + std::vector ksize = + BOOST_GET_CONST(std::vector, desc.GetAttr("ksize")); + for (size_t i = 0; i < ksize.size(); i++) { + if (ksize[i] <= paddings[i]) { + VLOG(3) << "the padding size should be less than the filter size " + "for exclusive-counting pooling."; + return false; + } + } + } + } + if (desc.HasAttr("ceil_mode")) { + if (BOOST_GET_CONST(bool, desc.GetAttr("ceil_mode"))) return false; + } if (desc.Input("X").size() != 1) { VLOG(3) << "TRT Pool2d expect 1 input, but got " << desc.Input("X").size(); @@ -440,6 +456,10 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, } } + if (op_type == "anchor_generator") { + if (!with_dynamic_shape) return false; + } + if (op_type == "yolo_box") { if (with_dynamic_shape) return false; bool has_attrs = diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_anchor_generator.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_anchor_generator.py new file mode 100644 index 0000000000000..bf457a9da40a8 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_anchor_generator.py @@ -0,0 +1,116 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from trt_layer_auto_scan_test import TrtLayerAutoScanTest, SkipReasons +from program_config import TensorConfig, ProgramConfig +import numpy as np +import paddle.inference as paddle_infer +from functools import partial +from typing import Optional, List, Callable, Dict, Any, Set + + +class TrtConvertAnchorGeneratorTest(TrtLayerAutoScanTest): + def is_program_valid(self, program_config: ProgramConfig) -> bool: + return True + + def sample_program_configs(self): + def generate_input1(batch, attrs: List[Dict[str, Any]]): + return np.random.random([batch, 3, 64, 64]).astype(np.float32) + + for batch in [1, 2, 4]: + for anchor_sizes in [[64.0, 128.0, 256.0, 512.0]]: + for aspect_ratios in [[0.5, 1, 2], [0.4, 1.2, 3]]: + for variances in [[1.0, 1.0, 1.0, 1.0], + [0.5, 1.0, 0.5, 1.0]]: + for stride in [[16.0, 16.0], [16.0, 32.0]]: + for offset in [0.5, 0.8]: + dics = [{ + "anchor_sizes": anchor_sizes, + "aspect_ratios": aspect_ratios, + "variances": variances, + "stride": stride, + "offset": offset + }] + + ops_config = [{ + "op_type": "anchor_generator", + "op_inputs": { + "Input": ["input_data"] + }, + "op_outputs": { + "Anchors": ["output_anchors"], + "Variances": ["output_variances"] + }, + "op_attrs": dics[0] + }] + ops = self.generate_op_config(ops_config) + + program_config = ProgramConfig( + ops=ops, + weights={}, + inputs={ + "input_data": TensorConfig( + data_gen=partial(generate_input1, + batch, dics)) + }, + outputs=[ + "output_anchors", "output_variances" + ]) + + yield program_config + + def sample_predictor_configs( + self, program_config) -> (paddle_infer.Config, List[int], float): + def generate_dynamic_shape(attrs): + self.dynamic_shape.min_input_shape = {"input_data": [1, 3, 32, 32]} + self.dynamic_shape.max_input_shape = {"input_data": [4, 3, 64, 64]} + self.dynamic_shape.opt_input_shape = {"input_data": [1, 3, 64, 64]} + + def clear_dynamic_shape(): + self.dynamic_shape.min_input_shape = {} + self.dynamic_shape.max_input_shape = {} + self.dynamic_shape.opt_input_shape = {} + + def generate_trt_nodes_num(attrs, dynamic_shape): + return 1, 3 + + attrs = [ + program_config.ops[i].attrs + for i in range(len(program_config.ops)) + ] + + # for static_shape + clear_dynamic_shape() + self.trt_param.precision = paddle_infer.PrecisionType.Float32 + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, False), 1e-5 + self.trt_param.precision = paddle_infer.PrecisionType.Half + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, False), 1e-5 + + # for dynamic_shape + generate_dynamic_shape(attrs) + self.trt_param.precision = paddle_infer.PrecisionType.Float32 + yield self.create_inference_config(), generate_trt_nodes_num(attrs, + True), 1e-5 + self.trt_param.precision = paddle_infer.PrecisionType.Half + yield self.create_inference_config(), generate_trt_nodes_num(attrs, + True), 1e-5 + + def test(self): + self.run_test() + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_conv2d_transpose.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_conv2d_transpose.py new file mode 100644 index 0000000000000..82dd492b5275f --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_conv2d_transpose.py @@ -0,0 +1,227 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from trt_layer_auto_scan_test import TrtLayerAutoScanTest, SkipReasons +from program_config import TensorConfig, ProgramConfig +import numpy as np +import paddle.inference as paddle_infer +from functools import partial +from typing import Optional, List, Callable, Dict, Any, Set + + +class TrtConvertConv2dTransposeTest(TrtLayerAutoScanTest): + def is_program_valid(self, program_config: ProgramConfig) -> bool: + inputs = program_config.inputs + weights = program_config.weights + attrs = [ + program_config.ops[i].attrs + for i in range(len(program_config.ops)) + ] + + if inputs['input_data'].shape[1] != weights['conv2d_weight'].shape[ + 1] * attrs[0]['groups']: + return False + + if inputs['input_data'].shape[1] != weights['conv2d_weight'].shape[0]: + return False + + return True + + def sample_program_configs(self): + self.trt_param.workspace_size = 1073741824 + + def generate_input1(batch, num_channels, attrs: List[Dict[str, Any]]): + return np.ones([batch, num_channels, 64, 64]).astype(np.float32) + + def generate_weight1(num_channels, attrs: List[Dict[str, Any]]): + if attrs[0]['groups'] == 1: + return np.random.random( + [num_channels, num_channels, 3, 3]).astype(np.float32) + else: + return np.random.random( + [num_channels, int(num_channels / 2), 3, 3]).astype( + np.float32) + + for num_channels in [2, 4, 6]: + for batch in [1, 2, 4]: + for strides in [[1, 1], [2, 2], [1, 2]]: + for paddings in [[0, 3], [1, 2, 3, 4]]: + for groups in [2]: + for padding_algorithm in [ + 'EXPLICIT', 'SAME', 'VALID' + ]: + for dilations in [[1, 1], [2, 2], [1, 2]]: + for data_format in ['NCHW']: + + self.num_channels = num_channels + dics = [{ + "data_fromat": data_format, + "dilations": dilations, + "padding_algorithm": + padding_algorithm, + "groups": groups, + "paddings": paddings, + "strides": strides, + "data_format": data_format, + "output_size": [], + "output_padding": [] + }] + + ops_config = [{ + "op_type": "conv2d_transpose", + "op_inputs": { + "Input": ["input_data"], + "Filter": ["conv2d_weight"] + }, + "op_outputs": { + "Output": ["output_data"] + }, + "op_attrs": dics[0] + }] + ops = self.generate_op_config( + ops_config) + + program_config = ProgramConfig( + ops=ops, + weights={ + "conv2d_weight": + TensorConfig(data_gen=partial( + generate_weight1, + num_channels, dics)) + }, + inputs={ + "input_data": + TensorConfig(data_gen=partial( + generate_input1, batch, + num_channels, dics)) + }, + outputs=["output_data"]) + + yield program_config + + def sample_predictor_configs( + self, program_config) -> (paddle_infer.Config, List[int], float): + def generate_dynamic_shape(attrs): + if self.num_channels == 2: + self.dynamic_shape.min_input_shape = { + "input_data": [1, 2, 32, 32], + "output_data": [1, 24, 32, 32] + } + self.dynamic_shape.max_input_shape = { + "input_data": [4, 2, 64, 64], + "output_data": [4, 24, 64, 64] + } + self.dynamic_shape.opt_input_shape = { + "input_data": [1, 2, 64, 64], + "output_data": [1, 24, 64, 64] + } + elif self.num_channels == 4: + self.dynamic_shape.min_input_shape = { + "input_data": [1, 4, 32, 32], + "output_data": [1, 24, 32, 32] + } + self.dynamic_shape.max_input_shape = { + "input_data": [4, 4, 64, 64], + "output_data": [4, 24, 64, 64] + } + self.dynamic_shape.opt_input_shape = { + "input_data": [1, 4, 64, 64], + "output_data": [1, 24, 64, 64] + } + else: + self.dynamic_shape.min_input_shape = { + "input_data": [1, 6, 32, 32], + "output_data": [1, 24, 32, 32] + } + self.dynamic_shape.max_input_shape = { + "input_data": [4, 6, 64, 64], + "output_data": [4, 24, 64, 64] + } + self.dynamic_shape.opt_input_shape = { + "input_data": [1, 6, 64, 64], + "output_data": [1, 24, 64, 64] + } + + def clear_dynamic_shape(): + self.dynamic_shape.min_input_shape = {} + self.dynamic_shape.max_input_shape = {} + self.dynamic_shape.opt_input_shape = {} + + def generate_trt_nodes_num(attrs, dynamic_shape): + return 1, 2 + + attrs = [ + program_config.ops[i].attrs + for i in range(len(program_config.ops)) + ] + + # for static_shape + clear_dynamic_shape() + self.trt_param.precision = paddle_infer.PrecisionType.Float32 + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, False), 1e-5 + self.trt_param.precision = paddle_infer.PrecisionType.Half + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, False), (1e-5, 1e-5) + self.trt_param.precision = paddle_infer.PrecisionType.Int8 + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, False), (1e-5, 1e-5) + + # for dynamic_shape + generate_dynamic_shape(attrs) + self.trt_param.precision = paddle_infer.PrecisionType.Float32 + yield self.create_inference_config(), generate_trt_nodes_num(attrs, + True), 1e-5 + self.trt_param.precision = paddle_infer.PrecisionType.Half + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, True), (1e-5, 1e-5) + self.trt_param.precision = paddle_infer.PrecisionType.Int8 + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, True), (1e-5, 1e-5) + + def add_skip_trt_case(self): + def teller1(program_config, predictor_config): + if program_config.ops[0].attrs[ + 'padding_algorithm'] == "SAME" or program_config.ops[ + 0].attrs['padding_algorithm'] == "VALID": + return True + return False + + self.add_skip_case( + teller1, SkipReasons.TRT_NOT_IMPLEMENTED, + "When padding_algorithm is 'SAME' or 'VALID', Trt dose not support. In this case, trt build error is caused by scale op." + ) + + def teller2(program_config, predictor_config): + if program_config.ops[0].attrs['dilations'][ + 0] != 1 or program_config.ops[0].attrs['dilations'][1] != 1: + return True + return False + + self.add_skip_case( + teller2, SkipReasons.TRT_NOT_IMPLEMENTED, + "When dilations's element is not equal 1, there are different behaviors between Trt and Paddle." + ) + + def test(self): + self.add_skip_trt_case() + self.run_test() + + def test_quant(self): + self.add_skip_trt_case() + self.run_test(quant=True) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_depthwise_conv2d.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_depthwise_conv2d.py new file mode 100644 index 0000000000000..e6b3aa30bf896 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_depthwise_conv2d.py @@ -0,0 +1,203 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from trt_layer_auto_scan_test import TrtLayerAutoScanTest, SkipReasons +from program_config import TensorConfig, ProgramConfig +import numpy as np +import paddle.inference as paddle_infer +from functools import partial +from typing import Optional, List, Callable, Dict, Any, Set + + +class TrtConvertDepthwiseConv2dTest(TrtLayerAutoScanTest): + def is_program_valid(self, program_config: ProgramConfig) -> bool: + inputs = program_config.inputs + weights = program_config.weights + attrs = [ + program_config.ops[i].attrs + for i in range(len(program_config.ops)) + ] + + if inputs['input_data'].shape[1] != weights['conv2d_weight'].shape[ + 1] * attrs[0]['groups']: + return False + + return True + + def sample_program_configs(self): + self.trt_param.workspace_size = 1073741824 + + def generate_input1(batch, attrs: List[Dict[str, Any]]): + if attrs[0]['groups'] == 1: + return np.ones([batch, 1, 64, 64]).astype(np.float32) + elif attrs[0]['groups'] == 2: + return np.ones([batch, 2, 64, 64]).astype(np.float32) + else: + return np.ones([batch, 3, 64, 64]).astype(np.float32) + + def generate_weight1(attrs: List[Dict[str, Any]]): + return np.random.random([24, 1, 3, 3]).astype(np.float32) + + for batch in [1, 2, 4]: + for strides in [[1, 1], [2, 2], [1, 2]]: + for paddings in [[0, 3], [1, 2, 3, 4]]: + for groups in [1, 2, 3]: + for padding_algorithm in ['EXPLICIT', 'SAME', 'VALID']: + for dilations in [[1, 1], [2, 2], [1, 2]]: + for data_format in ['NCHW']: + + dics = [{ + "data_fromat": data_format, + "dilations": dilations, + "padding_algorithm": padding_algorithm, + "groups": groups, + "paddings": paddings, + "strides": strides, + "data_format": data_format + }] + + ops_config = [{ + "op_type": "depthwise_conv2d", + "op_inputs": { + "Input": ["input_data"], + "Filter": ["conv2d_weight"] + }, + "op_outputs": { + "Output": ["output_data"] + }, + "op_attrs": dics[0] + }] + ops = self.generate_op_config(ops_config) + + program_config = ProgramConfig( + ops=ops, + weights={ + "conv2d_weight": + TensorConfig(data_gen=partial( + generate_weight1, dics)) + }, + inputs={ + "input_data": + TensorConfig(data_gen=partial( + generate_input1, batch, dics)) + }, + outputs=["output_data"]) + + yield program_config + + def sample_predictor_configs( + self, program_config) -> (paddle_infer.Config, List[int], float): + def generate_dynamic_shape(attrs): + if attrs[0]['groups'] == 1: + self.dynamic_shape.min_input_shape = { + "input_data": [1, 1, 32, 32], + "output_data": [1, 24, 32, 32] + } + self.dynamic_shape.max_input_shape = { + "input_data": [4, 1, 64, 64], + "output_data": [4, 24, 64, 64] + } + self.dynamic_shape.opt_input_shape = { + "input_data": [1, 1, 64, 64], + "output_data": [1, 24, 64, 64] + } + elif attrs[0]['groups'] == 2: + self.dynamic_shape.min_input_shape = { + "input_data": [1, 2, 32, 32], + "output_data": [1, 24, 32, 32] + } + self.dynamic_shape.max_input_shape = { + "input_data": [4, 2, 64, 64], + "output_data": [4, 24, 64, 64] + } + self.dynamic_shape.opt_input_shape = { + "input_data": [1, 2, 64, 64], + "output_data": [1, 24, 64, 64] + } + else: + self.dynamic_shape.min_input_shape = { + "input_data": [1, 3, 32, 32], + "output_data": [1, 24, 32, 32] + } + self.dynamic_shape.max_input_shape = { + "input_data": [4, 3, 64, 64], + "output_data": [4, 24, 64, 64] + } + self.dynamic_shape.opt_input_shape = { + "input_data": [1, 3, 64, 64], + "output_data": [1, 24, 64, 64] + } + + def clear_dynamic_shape(): + self.dynamic_shape.min_input_shape = {} + self.dynamic_shape.max_input_shape = {} + self.dynamic_shape.opt_input_shape = {} + + def generate_trt_nodes_num(attrs, dynamic_shape): + return 1, 2 + + attrs = [ + program_config.ops[i].attrs + for i in range(len(program_config.ops)) + ] + + # for static_shape + clear_dynamic_shape() + self.trt_param.precision = paddle_infer.PrecisionType.Float32 + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, False), 1e-5 + self.trt_param.precision = paddle_infer.PrecisionType.Half + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, False), (1e-5, 1e-5) + self.trt_param.precision = paddle_infer.PrecisionType.Int8 + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, False), (1e-5, 1e-5) + + # for dynamic_shape + + generate_dynamic_shape(attrs) + self.trt_param.precision = paddle_infer.PrecisionType.Float32 + yield self.create_inference_config(), generate_trt_nodes_num(attrs, + True), 1e-5 + self.trt_param.precision = paddle_infer.PrecisionType.Half + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, True), (1e-5, 1e-5) + self.trt_param.precision = paddle_infer.PrecisionType.Int8 + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, True), (1e-5, 1e-5) + + def add_skip_trt_case(self): + def teller1(program_config, predictor_config): + if program_config.ops[0].attrs[ + 'padding_algorithm'] == "SAME" or program_config.ops[ + 0].attrs['padding_algorithm'] == "VALID": + return True + return False + + self.add_skip_case( + teller1, SkipReasons.TRT_NOT_IMPLEMENTED, + "When padding_algorithm is 'SAME' or 'VALID', Trt dose not support. In this case, trt build error is caused by scale op." + ) + + def test(self): + self.add_skip_trt_case() + self.run_test() + + def test_quant(self): + self.add_skip_trt_case() + self.run_test(quant=True) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_depthwise_conv2d_transpose.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_depthwise_conv2d_transpose.py new file mode 100644 index 0000000000000..473925c6cdb79 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_depthwise_conv2d_transpose.py @@ -0,0 +1,191 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from trt_layer_auto_scan_test import TrtLayerAutoScanTest, SkipReasons +from program_config import TensorConfig, ProgramConfig +import numpy as np +import paddle.inference as paddle_infer +from functools import partial +from typing import Optional, List, Callable, Dict, Any, Set + + +class TrtConvertDepthwiseConv2dTransposeTest(TrtLayerAutoScanTest): + def is_program_valid(self, program_config: ProgramConfig) -> bool: + inputs = program_config.inputs + weights = program_config.weights + attrs = [ + program_config.ops[i].attrs + for i in range(len(program_config.ops)) + ] + + if inputs['input_data'].shape[1] != weights['conv2d_weight'].shape[ + 1] * attrs[0]['groups']: + return False + + if inputs['input_data'].shape[1] != weights['conv2d_weight'].shape[1]: + return False + + if inputs['input_data'].shape[1] != attrs[0]['groups']: + return False + + return True + + def sample_program_configs(self): + self.trt_param.workspace_size = 1073741824 + + def generate_input1(batch, attrs: List[Dict[str, Any]]): + return np.ones( + [batch, attrs[0]['groups'], 64, 64]).astype(np.float32) + + def generate_weight1(attrs: List[Dict[str, Any]]): + return np.random.random( + [attrs[0]['groups'], 1, 3, 3]).astype(np.float32) + + for batch in [1, 2, 4]: + for strides in [[1, 1], [2, 2], [1, 2]]: + for paddings in [[0, 3], [1, 2, 3, 4]]: + for groups in [1, 2, 3]: + for padding_algorithm in ['EXPLICIT', 'SAME', 'VALID']: + for dilations in [[1, 1], [2, 2], [1, 2]]: + for data_format in ['NCHW']: + + dics = [{ + "data_fromat": data_format, + "dilations": dilations, + "padding_algorithm": padding_algorithm, + "groups": groups, + "paddings": paddings, + "strides": strides, + "data_format": data_format, + "output_size": [], + "output_padding": [] + }] + + ops_config = [{ + "op_type": "conv2d_transpose", + "op_inputs": { + "Input": ["input_data"], + "Filter": ["conv2d_weight"] + }, + "op_outputs": { + "Output": ["output_data"] + }, + "op_attrs": dics[0] + }] + ops = self.generate_op_config(ops_config) + + program_config = ProgramConfig( + ops=ops, + weights={ + "conv2d_weight": + TensorConfig(data_gen=partial( + generate_weight1, dics)) + }, + inputs={ + "input_data": + TensorConfig(data_gen=partial( + generate_input1, batch, dics)) + }, + outputs=["output_data"]) + + yield program_config + + def sample_predictor_configs( + self, program_config) -> (paddle_infer.Config, List[int], float): + def generate_dynamic_shape(attrs): + self.dynamic_shape.min_input_shape = { + "input_data": [1, attrs[0]['groups'], 32, 32], + "output_data": [1, attrs[0]['groups'], 32, 32] + } + self.dynamic_shape.max_input_shape = { + "input_data": [4, attrs[0]['groups'], 64, 64], + "output_data": [4, attrs[0]['groups'], 64, 64] + } + self.dynamic_shape.opt_input_shape = { + "input_data": [1, attrs[0]['groups'], 64, 64], + "output_data": [1, attrs[0]['groups'], 64, 64] + } + + def clear_dynamic_shape(): + self.dynamic_shape.min_input_shape = {} + self.dynamic_shape.max_input_shape = {} + self.dynamic_shape.opt_input_shape = {} + + def generate_trt_nodes_num(attrs, dynamic_shape): + return 1, 2 + + attrs = [ + program_config.ops[i].attrs + for i in range(len(program_config.ops)) + ] + + # for static_shape + clear_dynamic_shape() + self.trt_param.precision = paddle_infer.PrecisionType.Float32 + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, False), 1e-5 + self.trt_param.precision = paddle_infer.PrecisionType.Half + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, False), (1e-5, 1e-5) + self.trt_param.precision = paddle_infer.PrecisionType.Int8 + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, False), (1e-5, 1e-5) + + # for dynamic_shape + generate_dynamic_shape(attrs) + self.trt_param.precision = paddle_infer.PrecisionType.Float32 + yield self.create_inference_config(), generate_trt_nodes_num(attrs, + True), 1e-5 + self.trt_param.precision = paddle_infer.PrecisionType.Half + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, True), (1e-5, 1e-5) + self.trt_param.precision = paddle_infer.PrecisionType.Int8 + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, True), (1e-5, 1e-5) + + def add_skip_trt_case(self): + def teller1(program_config, predictor_config): + if program_config.ops[0].attrs[ + 'padding_algorithm'] == "SAME" or program_config.ops[ + 0].attrs['padding_algorithm'] == "VALID": + return True + return False + + self.add_skip_case( + teller1, SkipReasons.TRT_NOT_IMPLEMENTED, + "When padding_algorithm is 'SAME' or 'VALID', Trt dose not support. In this case, trt build error is caused by scale op." + ) + + def teller2(program_config, predictor_config): + if program_config.ops[0].attrs['dilations'][ + 0] != 1 or program_config.ops[0].attrs['dilations'][1] != 1: + return True + return False + + self.add_skip_case( + teller2, SkipReasons.TRT_NOT_IMPLEMENTED, + "When dilations's element is not equal 1, there are different behaviors between Trt and Paddle." + ) + + def test(self): + self.add_skip_trt_case() + self.run_test() + + def test_quant(self): + self.add_skip_trt_case() + self.run_test(quant=True) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_pool2d.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_pool2d.py new file mode 100644 index 0000000000000..3e923b1bd89d6 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_pool2d.py @@ -0,0 +1,148 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from trt_layer_auto_scan_test import TrtLayerAutoScanTest, SkipReasons +from program_config import TensorConfig, ProgramConfig +import numpy as np +import paddle.inference as paddle_infer +from functools import partial +from typing import Optional, List, Callable, Dict, Any, Set + + +class TrtConvertPool2dTest(TrtLayerAutoScanTest): + def is_program_valid(self, program_config: ProgramConfig) -> bool: + return True + + def sample_program_configs(self): + self.trt_param.workspace_size = 1073741824 + + def generate_input1(attrs: List[Dict[str, Any]]): + return np.ones([1, 3, 64, 64]).astype(np.float32) + + def generate_weight1(attrs: List[Dict[str, Any]]): + return np.random.random([24, 3, 3, 3]).astype(np.float32) + + for strides in [[1, 1], [2, 2], [1, 2]]: + for paddings in [[0, 2], [0, 3], [1, 2, 3, 4]]: + for pooling_type in ['max', 'avg']: + for padding_algotithm in ['EXPLICIT', 'SAME', 'VAILD']: + for ksize in [[2, 3], [3, 3]]: + for data_format in ['NCHW']: + for global_pooling in [True, False]: + for exclusive in [True, False]: + for adaptive in [True, False]: + for ceil_mode in [True, False]: + self.paddings = paddings + + dics = [{ + "pooling_type": + pooling_type, + "ksize": ksize, + "data_fromat": data_format, + "padding_algorithm": + padding_algotithm, + "paddings": paddings, + "strides": strides, + "data_format": data_format, + "global_pooling": + global_pooling, + "exclusive": exclusive, + "adaptive": adaptive, + "ceil_mode": ceil_mode + }] + + ops_config = [{ + "op_type": "pool2d", + "op_inputs": { + "X": ["input_data"], + }, + "op_outputs": { + "Out": ["output_data"] + }, + "op_attrs": dics[0] + }] + ops = self.generate_op_config( + ops_config) + + program_config = ProgramConfig( + ops=ops, + weights={}, + inputs={ + "input_data": + TensorConfig( + data_gen=partial( + generate_input1, + dics)) + }, + outputs=["output_data"]) + + yield program_config + + def sample_predictor_configs( + self, program_config) -> (paddle_infer.Config, List[int], float): + def generate_dynamic_shape(attrs): + self.dynamic_shape.min_input_shape = {"input_data": [1, 3, 32, 32]} + self.dynamic_shape.max_input_shape = {"input_data": [4, 3, 64, 64]} + self.dynamic_shape.opt_input_shape = {"input_data": [1, 3, 64, 64]} + + def clear_dynamic_shape(): + self.dynamic_shape.min_input_shape = {} + self.dynamic_shape.max_input_shape = {} + self.dynamic_shape.opt_input_shape = {} + + def generate_trt_nodes_num(attrs, dynamic_shape): + if self.paddings == [0, 3] or attrs[0][ + 'global_pooling'] == True or attrs[0]['ceil_mode'] == True: + return 0, 3 + return 1, 2 + + attrs = [ + program_config.ops[i].attrs + for i in range(len(program_config.ops)) + ] + + # for static_shape + clear_dynamic_shape() + self.trt_param.precision = paddle_infer.PrecisionType.Float32 + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, False), 1e-5 + self.trt_param.precision = paddle_infer.PrecisionType.Half + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, False), 1e-5 + + # for dynamic_shape + generate_dynamic_shape(attrs) + self.trt_param.precision = paddle_infer.PrecisionType.Float32 + yield self.create_inference_config(), generate_trt_nodes_num(attrs, + True), 1e-5 + self.trt_param.precision = paddle_infer.PrecisionType.Half + yield self.create_inference_config(), generate_trt_nodes_num(attrs, + True), 1e-5 + + def add_skip_trt_case(self): + def teller1(program_config, predictor_config): + if len(program_config.ops[0].attrs['paddings']) == 4: + return True + return False + + self.add_skip_case(teller1, SkipReasons.TRT_NOT_IMPLEMENTED, + "4-dims paddings are not support for trt now.") + + def test(self): + self.add_skip_trt_case() + self.run_test() + + +if __name__ == "__main__": + unittest.main() From 1691dc7a9c0a3e861a16e58c5508e3e7233be27d Mon Sep 17 00:00:00 2001 From: ShenLiang <1422485404@qq.com> Date: Fri, 24 Sep 2021 16:43:53 +0800 Subject: [PATCH 5/7] add update (#36017) --- .../paddle/distributed/fleet/meta_parallel/pipeline_parallel.py | 1 + python/paddle/fluid/tests/unittests/hybrid_parallel_mp_amp.py | 1 + 2 files changed, 2 insertions(+) diff --git a/python/paddle/distributed/fleet/meta_parallel/pipeline_parallel.py b/python/paddle/distributed/fleet/meta_parallel/pipeline_parallel.py index 8fad0686dd42e..431bc6d7bc389 100755 --- a/python/paddle/distributed/fleet/meta_parallel/pipeline_parallel.py +++ b/python/paddle/distributed/fleet/meta_parallel/pipeline_parallel.py @@ -329,6 +329,7 @@ def _broadcast_final_loss(self): def _optimizer_step(self): if self.scaler: self.scaler.step(self.optimizer) + self.scaler.update() else: self.optimizer.step() diff --git a/python/paddle/fluid/tests/unittests/hybrid_parallel_mp_amp.py b/python/paddle/fluid/tests/unittests/hybrid_parallel_mp_amp.py index 083ad319305f3..4c966585d5f1f 100644 --- a/python/paddle/fluid/tests/unittests/hybrid_parallel_mp_amp.py +++ b/python/paddle/fluid/tests/unittests/hybrid_parallel_mp_amp.py @@ -48,6 +48,7 @@ def train_batch(self, batch, model, optimizer, is_mp): scaled.backward() # do backward scaler.step(optimizer) # update parameters + scaler.update() optimizer.clear_grad() return scaled From 787273eda531ae0cd651532eb39850a3614bfd67 Mon Sep 17 00:00:00 2001 From: piotrekobiIntel Date: Fri, 24 Sep 2021 11:05:03 +0200 Subject: [PATCH 6/7] Added elementwise_sub_mkldnn operator (#35662) * Add elementwise_sub_mkldnn_op without grad * Add test to static_mode_white_list * Refactor code, change license years * Remove invalid grad implementation * Fix element_wise_sub_op test * Fix CI Approval error * Remove unnecessary EltwiseSubMKLDNNGradKernel class * Fix CI Approval 2 * Fix CI Approval 3 * Fix CI Approval Attempt #4 * Fix CI Approve Attempt #5 * Fix CI Approval Attempt #6 * Fix CI Approval Attemt #7 * Change test names containing add to sub * Fix old tests testing add instead of sub * Copy grad implementation from elementwise_add_mkldnn * CI test fix attempt * Revert "CI test fix attempt" This reverts commit c647cacf41e6a87c715385a185de5cbf65fc8900. * Fix CI attempt 2 * Fix elementwise_sub tests, temporary mkldnn broadcast test disable * Add working implementation of elementwise_sub grad * Fix build errors caused by pull * Fix format error * Fix format error 2 * Disable elementwise_sub_mkldnn test on GPU * Apply fix for paddle.fluid import * Revert changes of test_elementwise_sub and Fix mkldnn test * Revert "Apply fix for paddle.fluid import" This reverts commit fc3b122fec8e12f2bcb32928a2685ba4d20fd742. * fix bug of module 'paddle' has no attribute 'fluid' for python3.6 (#35862) * Add changes suggested by reviewers * Change @unittest.skipIf... to @OpTestTool.skip_if_not_cpu_bf16() to satisfy Approval CI * Remove check_dygraph=False to satisify CI Approval Co-authored-by: zhangbo9674 <82555433+zhangbo9674@users.noreply.github.com> --- .../mkldnn/elementwise_sub_mkldnn_op.cc | 132 ++++++++++ paddle/fluid/platform/mkldnn_reuse.h | 16 +- .../mkldnn/test_elementwise_sub_mkldnn_op.py | 236 ++++++++++++++++++ tools/static_mode_white_list.py | 1 + 4 files changed, 380 insertions(+), 5 deletions(-) create mode 100644 paddle/fluid/operators/elementwise/mkldnn/elementwise_sub_mkldnn_op.cc create mode 100644 python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_sub_mkldnn_op.py diff --git a/paddle/fluid/operators/elementwise/mkldnn/elementwise_sub_mkldnn_op.cc b/paddle/fluid/operators/elementwise/mkldnn/elementwise_sub_mkldnn_op.cc new file mode 100644 index 0000000000000..be8dad62c3c05 --- /dev/null +++ b/paddle/fluid/operators/elementwise/mkldnn/elementwise_sub_mkldnn_op.cc @@ -0,0 +1,132 @@ + +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h" +namespace paddle { +namespace framework { +class ExecutionContext; +} // namespace framework +namespace platform { +class CPUDeviceContext; +struct CPUPlace; +} // namespace platform +} // namespace paddle + +namespace paddle { +namespace operators { +template +class EltwiseSubMKLDNNGradKernel : public ElemwiseGradKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + ElemwiseGradKernel::Compute(ctx); + using Tensor = framework::Tensor; + + auto& dev_ctx = + ctx.template device_context(); + const auto& onednn_engine = dev_ctx.GetEngine(); + + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* dx = ctx.Output(framework::GradVarName("X")); + auto* dy = ctx.Output(framework::GradVarName("Y")); + + auto tz = framework::vectorize(dout->dims()); + memory::data_type dout_type = framework::ToMKLDNNDataType(dout->type()); + platform::ReorderMKLDNNHandler handler(tz, dout->type(), dout_type, + onednn_engine); + + auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); + auto reorder_src_memory_p = handler.AcquireSrcMemory( + dout->format(), platform::to_void_cast(dout->data())); + + if (dx) { + auto reorder_dst_memory_p = + handler.AcquireDstMemory(dx, dout->format(), ctx.GetPlace()); + auto reorder_p = + handler.AcquireReorder(reorder_dst_memory_p, reorder_src_memory_p); + platform::RecordEvent record_reorder("int_reorder", + platform::EventRole::kUniqueOp); + + reorder_p->execute(astream, *reorder_src_memory_p, *reorder_dst_memory_p); + astream.wait(); + + dx->set_layout(DataLayout::kMKLDNN); + dx->set_format(platform::GetMKLDNNFormat(*reorder_dst_memory_p)); + } + + if (dy) { + // Direct copy + if (dout->dims() == dy->dims()) { + auto reorder_dst_memory_p = + handler.AcquireDstMemory(dy, dout->format(), ctx.GetPlace()); + + dnnl::primitive_attr reorder_attr; + std::vector scales = {-1}; + reorder_attr.set_output_scales(0, scales); + auto reorder_p = std::make_shared( + *(reorder_src_memory_p), *(reorder_dst_memory_p), reorder_attr); + platform::RecordEvent record_reorder("int_reorder", + platform::EventRole::kUniqueOp); + reorder_p->execute(astream, *reorder_src_memory_p, + *reorder_dst_memory_p); + astream.wait(); + + dy->set_layout(DataLayout::kMKLDNN); + dy->set_format(platform::GetMKLDNNFormat(*reorder_dst_memory_p)); + } else { + // Broadcasting + + dnnl::post_ops po; + po.append_eltwise(1.0f, dnnl::algorithm::eltwise_linear, -1.0f, 0); + dnnl::primitive_attr attr; + attr.set_post_ops(po); + + platform::ReductionMKLDNNHandler handler_sum( + dnnl::algorithm::reduction_sum, 0.0f, 0.0f, onednn_engine, + ctx.GetPlace(), dout, dy, CalculateBroadcastedDims(dout, dy), attr); + + auto dy_memory_p = handler_sum.AcquireDstMemory(dy); + auto reduction_p = handler_sum.AcquireForwardPrimitive(); + + reduction_p->execute(astream, { + {DNNL_ARG_SRC, *reorder_src_memory_p}, + {DNNL_ARG_DST, *dy_memory_p}, + }); + astream.wait(); + + dy->set_layout(DataLayout::kMKLDNN); + dy->set_format( + platform::GetMKLDNNFormat(dy_memory_p->get_desc().reshape( + paddle::framework::vectorize(dy->dims())))); + } + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_KERNEL( + elementwise_sub, MKLDNN, paddle::platform::CPUPlace, + ops::EltwiseMKLDNNKernel, + ops::EltwiseMKLDNNKernel, + ops::EltwiseMKLDNNKernel, + ops::EltwiseMKLDNNKernel) + +REGISTER_OP_KERNEL(elementwise_sub_grad, MKLDNN, ::paddle::platform::CPUPlace, + ops::EltwiseSubMKLDNNGradKernel, + ops::EltwiseSubMKLDNNGradKernel) diff --git a/paddle/fluid/platform/mkldnn_reuse.h b/paddle/fluid/platform/mkldnn_reuse.h index d6ab9e50a066e..1aa8c0cdb57f9 100644 --- a/paddle/fluid/platform/mkldnn_reuse.h +++ b/paddle/fluid/platform/mkldnn_reuse.h @@ -19,6 +19,7 @@ limitations under the License. */ #include #include #include + #include "boost/optional.hpp" #include "paddle/fluid/framework/data_layout_transform.h" #include "paddle/fluid/framework/operator.h" @@ -927,7 +928,6 @@ class BroadcastDataMKLDNNHandler std::shared_ptr AcquireDstMemory(framework::Tensor* output) { T_out* ptr = output->mutable_data( this->place_, this->fwd_pd_->dst_desc().get_size()); - ; memset(ptr, 0, this->fwd_pd_->dst_desc().get_size()); return this->AcquireMemoryFromPrimitive(this->fwd_pd_->dst_desc(), ptr); } @@ -940,7 +940,8 @@ class ReductionMKLDNNHandler ReductionMKLDNNHandler(const dnnl::algorithm algo, const float p, const float eps, const mkldnn::engine engine, platform::Place cpu_place, const Tensor* x, - const Tensor* y, std::vector y_tz) + const Tensor* y, std::vector y_tz, + const dnnl::primitive_attr& attr = NULL) : platform::MKLDNNHandlerNoCachingT(engine, cpu_place) { PADDLE_ENFORCE_EQ( @@ -957,7 +958,10 @@ class ReductionMKLDNNHandler const auto y_md = memory::desc(y_tz, platform::MKLDNNGetDataType(), x->format()); - this->AcquireForwardPrimitiveDescriptor(algo, x_md, y_md, p, eps); + if (attr) + this->AcquireForwardPrimitiveDescriptor(attr, algo, x_md, y_md, p, eps); + else + this->AcquireForwardPrimitiveDescriptor(algo, x_md, y_md, p, eps); } }; @@ -979,8 +983,9 @@ class ActivationMKLDNNHandler if (ctx.Type() == "scale") { bool bias_after_scale = ctx.Attr("bias_after_scale"); auto* scale_tensor = ctx.Input("ScaleTensor"); - alpha = (scale_tensor == nullptr) ? ctx.Attr("scale") - : (float)*(scale_tensor->data()); + alpha = (scale_tensor == nullptr) + ? ctx.Attr("scale") + : static_cast(*(scale_tensor->data())); beta = ctx.Attr("bias"); // if bias_after_scale == true // out = scale*X + bias @@ -1504,6 +1509,7 @@ static void SetDstMemoryQuantized( T* output_data = output->mutable_data(ctx.GetPlace()); const size_t dst_dims = dst_tz.size(); MKLDNNMemoryFormat dst_fmt; + PADDLE_ENFORCE_LE(dst_dims, 5, platform::errors::InvalidArgument( "Dst memory for quantization can not have " "dims > 5. But received dst_dims is %d.", diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_sub_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_sub_mkldnn_op.py new file mode 100644 index 0000000000000..62c8c9571b793 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_sub_mkldnn_op.py @@ -0,0 +1,236 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function +import unittest +import numpy as np +from paddle import enable_static +from paddle.fluid.tests.unittests.op_test import OpTest, OpTestTool, convert_float_to_uint16 +from paddle.fluid.framework import _current_expected_place +import paddle.fluid.core as core + + +@OpTestTool.skip_if(not (isinstance(_current_expected_place(), core.CPUPlace)), + "GPU is not supported") +class TestMKLDNNElementwiseSubOp(OpTest): + def setUp(self): + self.op_type = "elementwise_sub" + self.init_dtype() + self.init_input_output() + self.init_kernel_type() + self.init_axis() + self.inputs = { + 'X': OpTest.np_dtype_to_fluid_dtype(self.x), + 'Y': OpTest.np_dtype_to_fluid_dtype(self.y) + } + self.attrs = {'axis': self.axis, 'use_mkldnn': self.use_mkldnn} + self.outputs = {'Out': self.out} + + def init_input_output(self): + self.x = np.random.uniform(0.1, 1, [13, 17]).astype(self.dtype) + self.y = np.random.uniform(0.1, 1, [13, 17]).astype(self.dtype) + self.out = np.subtract(self.x, self.y) + + def test_check_grad_normal(self): + self.check_grad(['X', 'Y'], 'Out') + + def test_check_grad_ignore_x(self): + self.check_grad(['Y'], 'Out', no_grad_set=set("X")) + + def test_check_grad_ignore_y(self): + self.check_grad(['X'], 'Out', no_grad_set=set('Y')) + + def init_axis(self): + self.axis = -1 + + def init_kernel_type(self): + self.use_mkldnn = True + + def init_dtype(self): + self.dtype = np.float32 + + def test_check_output(self): + self.check_output() + + +class TestMKLDNNElementwiseSubOp2(TestMKLDNNElementwiseSubOp): + def init_input_output(self): + self.x = np.random.random((100, )).astype(self.dtype) + self.y = np.random.random((100, )).astype(self.dtype) + self.out = np.subtract(self.x, self.y) + + +class TestMKLDNNElementwiseSubOp3(TestMKLDNNElementwiseSubOp): + def init_input_output(self): + self.x = np.random.uniform(0.1, 1, [2, 3, 4, 5]).astype(self.dtype) + self.y = np.random.uniform(0.1, 1, [2, 3, 4, 5]).astype(self.dtype) + self.out = np.subtract(self.x, self.y) + + +class TestMKLDNNElementwiseSubOp4(TestMKLDNNElementwiseSubOp): + def init_input_output(self): + self.x = np.random.uniform(1, 2, [2, 3, 4, 32]).astype(self.dtype) + self.y = np.random.uniform(1, 2, [4, 32]).astype(self.dtype) + self.out = np.subtract(self.x, self.y) + + +class TestMKLDNNElementwiseSubOp5(TestMKLDNNElementwiseSubOp): + def init_input_output(self): + self.x = np.random.uniform(1, 2, [2, 3, 4, 100]).astype(self.dtype) + self.y = np.random.uniform(1, 2, [100]).astype(self.dtype) + self.out = np.subtract(self.x, self.y) + + +class TestMKLDNNElementwiseSubOp_broadcast(TestMKLDNNElementwiseSubOp): + def init_input_output(self): + self.x = np.random.rand(2, 10, 12, 3).astype(self.dtype) + self.y = np.random.rand(10, 12).astype(self.dtype) + self.out = self.x - self.y.reshape(1, 10, 12, 1) + + def init_axis(self): + self.axis = 1 + + +class TestElementwiseSubOp_xsize_lessthan_ysize_sub(TestMKLDNNElementwiseSubOp): + def init_input_output(self): + self.x = np.random.rand(10, 12).astype(self.dtype) + self.y = np.random.rand(2, 2, 10, 12).astype(self.dtype) + self.out = self.x - self.y + + def init_axis(self): + self.axis = 2 + + def test_check_grad_normal(self): + pass + + def test_check_grad_ignore_y(self): + pass + + def test_check_grad_ignore_x(self): + pass + + +@OpTestTool.skip_if_not_cpu_bf16() +class TestBf16(TestMKLDNNElementwiseSubOp): + def setUp(self): + self.op_type = "elementwise_sub" + self.init_dtype() + self.init_input_output() + self.init_kernel_type() + self.init_axis() + + self.x_bf16 = convert_float_to_uint16(self.x) + self.y_bf16 = convert_float_to_uint16(self.y) + self.inputs = {'X': self.x_bf16, 'Y': self.y_bf16} + self.attrs = {'axis': self.axis, 'use_mkldnn': self.use_mkldnn} + self.outputs = {'Out': convert_float_to_uint16(self.out)} + + def init_dtype(self): + self.dtype = np.float32 + self.mkldnn_data_type = "bfloat16" + + def init_input_output(self): + self.x = np.random.random(100, ).astype(self.dtype) + self.y = np.random.random(100, ).astype(self.dtype) + self.out = np.subtract(self.x, self.y) + + def test_check_output(self): + self.check_output_with_place(core.CPUPlace()) + + def test_check_grad_normal(self): + self.check_grad_with_place( + core.CPUPlace(), ["X", "Y"], + "Out", + user_defined_grads=[self.x, -self.x], + user_defined_grad_outputs=[self.x_bf16]) + + def test_check_grad_ignore_x(self): + self.check_grad_with_place( + core.CPUPlace(), ["Y"], + "Out", + user_defined_grads=[-self.y], + user_defined_grad_outputs=[self.y_bf16]) + + def test_check_grad_ignore_y(self): + self.check_grad_with_place( + core.CPUPlace(), ["X"], + "Out", + user_defined_grads=[self.x], + user_defined_grad_outputs=[self.x_bf16]) + + +class TestBf16Broadcasting(TestBf16): + def init_input_output(self): + self.x = np.random.uniform(1, 2, [2, 3, 4, 100]).astype(self.dtype) + self.y = np.random.uniform(1, 2, [100]).astype(self.dtype) + self.out = np.subtract(self.x, self.y) + + def compute_reduced_gradients(self, out_grads): + part_sum = np.add.reduceat(out_grads, [0], axis=0) + part_sum = np.add.reduceat(part_sum, [0], axis=1) + part_sum = np.add.reduceat(part_sum, [0], axis=2) + return -part_sum.flatten() + + def test_check_grad_normal(self): + self.check_grad_with_place( + core.CPUPlace(), ["X", "Y"], + "Out", + user_defined_grads=[ + self.x, self.compute_reduced_gradients(self.x) + ], + user_defined_grad_outputs=[self.x_bf16]) + + def test_check_grad_ignore_x(self): + self.check_grad_with_place( + core.CPUPlace(), ["Y"], + "Out", + user_defined_grads=[self.compute_reduced_gradients(self.x)], + user_defined_grad_outputs=[self.x_bf16]) + + +class TestInt8(TestMKLDNNElementwiseSubOp): + def init_kernel_type(self): + self.use_mkldnn = True + self._cpu_only = True + + def init_dtype(self): + self.dtype = np.int8 + + def init_input_output(self): + self.x = np.random.randint(0, 3, (12, 9)).astype("int8") + self.y = np.random.randint(0, 3, (12, 9)).astype("int8") + self.out = np.subtract(self.x, self.y) + + def init_scales(self): + self.attrs['Scale_x'] = 1.0 + self.attrs['Scale_y'] = 1.0 + self.attrs['Scale_out'] = 1.0 + + def test_check_output(self): + self.init_scales() + self.check_output() + + def test_check_grad_normal(self): + pass + + def test_check_grad_ignore_x(self): + pass + + def test_check_grad_ignore_y(self): + pass + + +if __name__ == '__main__': + enable_static() + unittest.main() diff --git a/tools/static_mode_white_list.py b/tools/static_mode_white_list.py index 43281d4375ed0..7d0a2a8953fc8 100644 --- a/tools/static_mode_white_list.py +++ b/tools/static_mode_white_list.py @@ -610,6 +610,7 @@ 'test_dequantize_mkldnn_op', 'test_elementwise_add_mkldnn_op', 'test_elementwise_add_bf16_mkldnn_op', + 'test_elementwise_sub_mkldnn_op', 'test_elementwise_mul_mkldnn_op', 'test_elementwise_mul_bf16_mkldnn_op', 'test_fc_mkldnn_op', From b91e8eec405ea6440124149cf6d21d559c7fded1 Mon Sep 17 00:00:00 2001 From: jiangcheng Date: Fri, 24 Sep 2021 17:11:00 +0800 Subject: [PATCH 7/7] add gradient kernel of det op and slogdet op (#36013) * add gradient kernel of det op and slogdet op * fix CI APPROVAL problem --- paddle/fluid/operators/determinant_op.cc | 11 +- paddle/fluid/operators/determinant_op.cu | 36 --- paddle/fluid/operators/determinant_op.h | 262 ++++++++++++++++-- .../tests/unittests/test_determinant_op.py | 32 +-- 4 files changed, 266 insertions(+), 75 deletions(-) diff --git a/paddle/fluid/operators/determinant_op.cc b/paddle/fluid/operators/determinant_op.cc index 379a401cde62e..98247fbc862bb 100644 --- a/paddle/fluid/operators/determinant_op.cc +++ b/paddle/fluid/operators/determinant_op.cc @@ -48,6 +48,8 @@ class DeterminantGradOp : public framework::OperatorWithKernel { OP_INOUT_CHECK(ctx->HasInput("Input"), "Input", "Input", "DeterminantGradOp"); OP_INOUT_CHECK(ctx->HasInput("Out"), "Input", "Out", "DeterminantGradOp"); + OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Out")), "Input", + framework::GradVarName("Out"), "DeterminantGradOp"); OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("Input")), "Output", framework::GradVarName("Input"), "DeterminantGradOp"); @@ -117,7 +119,8 @@ class SlogDeterminantGradOp : public framework::OperatorWithKernel { "SlogDeterminantGradOp"); OP_INOUT_CHECK(ctx->HasInput("Out"), "Input", "Out", "SlogDeterminantGradOp"); - + OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Out")), "Input", + framework::GradVarName("Out"), "SlogDeterminantGradOp"); OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("Input")), "Output", framework::GradVarName("Input"), "SlogDeterminantGradOp"); @@ -179,7 +182,7 @@ REGISTER_OPERATOR(slogdeterminant, ops::SlogDeterminantOp, ops::SlogDeterminantGradOpMaker); REGISTER_OPERATOR(slogdeterminant_grad, - ops::DeterminantGradOp) // reuse det grad op + ops::SlogDeterminantGradOp) // reuse det grad op REGISTER_OP_CPU_KERNEL( slogdeterminant, ops::SlogDeterminantKernel, @@ -187,5 +190,5 @@ REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL( slogdeterminant_grad, - ops::DeterminantGradKernel, - ops::DeterminantGradKernel); + ops::SlogDeterminantGradKernel, + ops::SlogDeterminantGradKernel); diff --git a/paddle/fluid/operators/determinant_op.cu b/paddle/fluid/operators/determinant_op.cu index f17d94d805228..d19d4c3d09386 100644 --- a/paddle/fluid/operators/determinant_op.cu +++ b/paddle/fluid/operators/determinant_op.cu @@ -14,42 +14,6 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/determinant_op.h" -#include "paddle/fluid/platform/cuda_primitives.h" - -namespace paddle { -namespace operators { - -using platform::PADDLE_CUDA_NUM_THREADS; -using Tensor = framework::Tensor; - -template -__global__ void DeterminantGrad(const size_t numel, T* out) { - int tid = threadIdx.x + blockIdx.x * blockDim.x; - if (tid < numel) { - out[tid] = static_cast(1); - } -} - -template -class DeterminantGradCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - const auto* dout = context.Input(framework::GradVarName("Out")); - const T* dout_data = dout->data(); - auto dout_dim = vectorize(dout->dims()); - - auto* dx = context.Output(framework::GradVarName("Input")); - T* dx_data = dx->mutable_data(context.GetPlace()); - - int64_t numel = dx->numel(); - for (int64_t idx = 0; idx < numel; idx++) { - dx_data[idx] = static_cast(1); - } - } -}; - -} // namespace operators -} // namespace paddle namespace ops = paddle::operators; namespace plat = paddle::platform; diff --git a/paddle/fluid/operators/determinant_op.h b/paddle/fluid/operators/determinant_op.h index ead1262d9fe06..4c17869fb5d2a 100644 --- a/paddle/fluid/operators/determinant_op.h +++ b/paddle/fluid/operators/determinant_op.h @@ -19,7 +19,11 @@ #include #include #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/complex_functors.h" +#include "paddle/fluid/operators/math/matrix_inverse.h" +#include "paddle/fluid/operators/svd_helper.h" #include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/for_range.h" namespace paddle { namespace operators { @@ -48,11 +52,10 @@ class EigenMatrix { inline int64_t GetBatchCount(const framework::DDim dims) { int64_t batch_count = 1; auto dim_size = dims.size(); - PADDLE_ENFORCE_GT(dim_size, 2, - platform::errors::InvalidArgument( - "To get the number of batch square matrices, " - "the size of dimension should greater than 2.", - dim_size)); + PADDLE_ENFORCE_GE( + dim_size, 2, + platform::errors::InvalidArgument( + "the input matrix dimension size should greater than 2.")); // Cumulative multiplying each dimension until the last 2 to get the batch // count, @@ -77,7 +80,7 @@ struct DeterminantFunctor { auto end_iter = input_vec.begin() + (i + 1) * rank * rank; std::vector sub_vec(begin_iter, end_iter); // get every square matrix data - Eigen::MatrixXf matrix(rank, rank); + typename EigenMatrix::MatrixType matrix(rank, rank); for (int64_t i = 0; i < rank; ++i) { for (int64_t j = 0; j < rank; ++j) { matrix(i, j) = sub_vec[rank * i + j]; @@ -109,41 +112,169 @@ class DeterminantKernel : public framework::OpKernel { "the input matrix should be square matrix.")); auto rank = input_dim[input_dim_size - 1]; // square matrix length DeterminantFunctor()(*input, context, rank, batch_count, output); + auto output_dims = + framework::slice_ddim(input->dims(), 0, input_dim_size - 2); if (input_dim_size > 2) { - auto output_dims = - framework::slice_ddim(input->dims(), 0, input_dim_size - 2); output->Resize(output_dims); + } else { + // when input is a two-dimension matrix, The det value is a number. + output->Resize({1}); } VLOG(2) << "output dim:" << output->dims(); } }; +template +struct FoundZeroFunctor { + FoundZeroFunctor(const T* x, int64_t numel, bool* res) + : x_(x), numel_(numel), res_(res) {} + HOSTDEVICE void operator()(size_t idx) const { + if (*res_ || idx >= static_cast(numel_)) { + // founded zero number + return; + } + *res_ = (x_[idx] == static_cast(0)); + } + const T* x_; + int64_t numel_; + bool* res_; +}; + +template +inline bool CheckMatrixInvertible(const framework::ExecutionContext& ctx, + const framework::Tensor* det) { + auto& dev_ctx = ctx.template device_context(); + auto numel = det->numel(); + + framework::Tensor dev_tensor; + auto* data = dev_tensor.mutable_data({1}, ctx.GetPlace()); + + // set false + math::SetConstant zero; + zero(dev_ctx, &dev_tensor, false); + + // find whether zero + platform::ForRange for_range(dev_ctx, numel); + FoundZeroFunctor functor(det->data(), numel, data); + for_range(functor); + + // copy to host + dev_ctx.Wait(); + framework::Tensor cpu_tensor; + framework::TensorCopy(dev_tensor, platform::CPUPlace(), &cpu_tensor); + + // if founded zero, the matrix is not invertible + // else the matrix is invertible + auto* res = cpu_tensor.data(); + return !(*res); +} + template class DeterminantGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - PADDLE_THROW(platform::errors::Unimplemented( - "Not support DeterminantGrad at this time.")); + auto& dev_ctx = context.template device_context(); + const auto* input = context.Input("Input"); + const auto* det = context.Input("Out"); + const auto* grad = + context.Input(framework::GradVarName("Out")); + auto* ddet = + context.Output(framework::GradVarName("Input")); + + auto input_dims_size = input->dims().size(); + if (input_dims_size > 2) { + PADDLE_ENFORCE_EQ( + grad->dims().size() + 2, input_dims_size, + platform::errors::InvalidArgument( + "The grad tensor of det dims size should 2 less than" + " input tensor's, but here differ %d", + input_dims_size - grad->dims().size())); + } else if (input_dims_size == 2) { + // input dims size 2 and grad dims size 1 is possible + PADDLE_ENFORCE_EQ( + grad->dims().size(), 1, + platform::errors::InvalidArgument( + "The grad tensor of det dims size should 2 less than" + " input tensor's, but here differ %d", + input_dims_size - grad->dims().size())); + } else { + // checked in forward, pass + } + + // Check Whether the matrix is invertible + // (matrix A not invertible) == (det(A)=0) + if (!CheckMatrixInvertible(context, det)) { + // The matrix is not invertible + VLOG(3) << "The input matrix not invertible!"; + ddet->Resize(input->dims()); + ddet->mutable_data(context.GetPlace()); + math::SetConstant zero; + zero(dev_ctx, ddet, static_cast(0.0f)); + return; + } + + // The matrix is invertible + // let |A| = Determinant(A) + // Ref to https://people.maths.ox.ac.uk/gilesm/files/NA-08-01.pdf + // we set d|A| = unsqueeze(dA * |A|, [-1, -2]) * inverse(A).transpose(-2, + // -1) + + math::DeviceIndependenceTensorOperations helper(context); + + // First: inverse(A) + framework::Tensor inverse_A; + // A must be square matrices! + inverse_A.Resize(input->dims()); + inverse_A.mutable_data(context.GetPlace()); + + math::MatrixInverseFunctor mat_inv; + mat_inv(dev_ctx, *input, &inverse_A); + + VLOG(3) << "inverse(A) dims: " << inverse_A.dims(); + + // Second: inverse(A).transpose(-2, -1) + framework::Tensor transpose_inverse_A = helper.Transpose(inverse_A); + VLOG(3) << "(dA * |A|).transpose(-2, -1) dims: " + << transpose_inverse_A.dims(); + + // Third: dA * |A| + auto mul_dA_detA = helper.Mul(*grad, *det); + VLOG(3) << "dA * |A| dims: " << mul_dA_detA.dims(); + + // Fourth: unsqueeze(dA * |A|, [-1, -2]) + auto unsqueeze1 = helper.Unsqueeze(mul_dA_detA, -1); + auto unsqueeze2 = helper.Unsqueeze(unsqueeze1, -2); + VLOG(3) << "unsqueezed(dA * |A|) dims: " << unsqueeze2.dims(); + + // Finally: unsqueeze(dA * |A|) * inverse(A) + auto res = helper.Mul(unsqueeze2, transpose_inverse_A); + + VLOG(3) << "unsqueeze(dA * |A|) * inverse(A) dims: " << res.dims(); + + framework::TensorCopy(res, context.GetPlace(), ddet); + + ddet->Resize(input->dims()); + VLOG(3) << "d|A| dims: " << ddet->dims(); } }; template struct SlogDeterminantFunctor { void operator()(const Tensor& input, const framework::ExecutionContext ctx, - int rank, int batch_count, Tensor* output) { + int64_t rank, int64_t batch_count, Tensor* output) { std::vector input_vec; std::vector sign_vec; std::vector log_vec; std::vector output_vec; framework::TensorToVector(input, ctx.device_context(), &input_vec); - for (int i = 0; i < batch_count; ++i) { // maybe can be parallel + for (int64_t i = 0; i < batch_count; ++i) { // maybe can be parallel auto begin_iter = input_vec.begin() + i * rank * rank; auto end_iter = input_vec.begin() + (i + 1) * rank * rank; std::vector sub_vec(begin_iter, end_iter); // get every square matrix data typename EigenMatrix::MatrixType matrix(rank, rank); - for (int i = 0; i < rank; ++i) { - for (int j = 0; j < rank; ++j) { + for (int64_t i = 0; i < rank; ++i) { + for (int64_t j = 0; j < rank; ++j) { matrix(i, j) = sub_vec[rank * i + j]; } } @@ -185,6 +316,10 @@ class SlogDeterminantKernel : public framework::OpKernel { auto rank = input_dim[input_dim_size - 1]; // square matrix length SlogDeterminantFunctor()(*input, context, rank, batch_count, output); std::vector output_dim_vec(input_dim.begin(), input_dim.end() - 2); + if (input_dim.size() == static_cast(2)) { + // when input is a two-dimension matrix, The det value is a number. + output_dim_vec = {1}; + } output_dim_vec.insert(output_dim_vec.begin(), 2); // make the output dims as same as numpy auto output_dims = framework::make_ddim(output_dim_vec); @@ -197,8 +332,103 @@ template class SlogDeterminantGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - PADDLE_THROW(platform::errors::Unimplemented( - "Not support SlogDeterminantGrad at this time.")); + auto& dev_ctx = context.template device_context(); + const auto* input = context.Input("Input"); + const auto* slogdet = context.Input("Out"); + const auto* grad = + context.Input(framework::GradVarName("Out")); + auto* dslogdet = + context.Output(framework::GradVarName("Input")); + + PADDLE_ENFORCE_EQ(grad->dims()[0], 2, + platform::errors::InvalidArgument( + "The grad tensor of SlogDet should contain two" + " grad: sign and absslogdet, but here %ld.", + grad->dims()[0])); + if (input->dims().size() > 2) { + PADDLE_ENFORCE_EQ( + grad->dims().size() + 1, input->dims().size(), + platform::errors::InvalidArgument( + "The grad tensor of slogdet dims size should 1 less than" + " input tensor's, but here differ %d", + input->dims().size() - grad->dims().size())); + } + + // Check Whether the matrix is invertible + // (matrix A not invertible) == (absslogdet(A)=0) + auto slogdet_vec = slogdet->Split(1, 0); + auto absslogdet_val = slogdet_vec[0]; + if (!CheckMatrixInvertible(context, &absslogdet_val)) { + // The matrix is not invertible + VLOG(3) << "The input matrix not invertible!"; + dslogdet->Resize(input->dims()); + dslogdet->mutable_data(context.GetPlace()); + math::SetConstant zero; + zero(dev_ctx, dslogdet, std::numeric_limits::quiet_NaN()); + return; + } + + // The matrix is invertible + // let sl|A| = SlogDeterminant(A) + // Ref to https://people.maths.ox.ac.uk/gilesm/files/NA-08-01.pdf + // we set dsl|A| = unsqueeze(dslA, [-1, -2]) * + // inverse(A).conj().transpose(-2, -1) + + math::DeviceIndependenceTensorOperations helper(context); + + // First: inverse(A) + framework::Tensor inverse_A; + // A must be square matrices! + inverse_A.Resize(input->dims()); + inverse_A.mutable_data(context.GetPlace()); + + math::MatrixInverseFunctor mat_inv; + mat_inv(dev_ctx, *input, &inverse_A); + + VLOG(3) << "inverse(A) dims: " << inverse_A.dims(); + + // Second: inverse(A).conj() + framework::Tensor conj_inverse_A; + conj_inverse_A.Resize(inverse_A.dims()); + auto numel = input->numel(); + auto* conj_data = conj_inverse_A.mutable_data(context.GetPlace(), + size_t(numel * sizeof(T))); + + platform::ForRange for_range(dev_ctx, numel); + math::ConjFunctor functor(inverse_A.data(), numel, conj_data); + for_range(functor); + + VLOG(3) << "inverse(A).conj() dims: " << conj_inverse_A.dims(); + + // Third: inverse(A).conj().transpose(-2, -1) + framework::Tensor transpose_inverse_A = helper.Transpose(conj_inverse_A); + VLOG(3) << "inverse(A).conj().transpose(-2, -1) dims: " + << transpose_inverse_A.dims(); + + // Fourth: split grad value to [sign_grad, absslogdet_grad] + auto grad_vec = grad->Split(1, 0); + auto det_grad = grad_vec[1]; + + // remmove useless first dimension + int det_grad_size = det_grad.dims().size(); + std::vector det_grad_vec; + for (int i = 1; i < det_grad_size; ++i) { + det_grad_vec.emplace_back(det_grad.dims()[i]); + } + det_grad.Resize(det_grad.dims().reshape(det_grad_vec)); + + // Fifth: unsqueeze(dslA, [-1, -2]) + auto unsqueeze1 = helper.Unsqueeze(det_grad, -1); + auto unsqueeze2 = helper.Unsqueeze(unsqueeze1, -2); + VLOG(3) << "unsqueezed(dslA, [-1, -2]) dims: " << unsqueeze2.dims(); + + // Finally: unsqueeze(dslA) * inverse(A) + auto res = helper.Mul(unsqueeze2, transpose_inverse_A); + VLOG(3) << "unsqueeze(dslA) * inverse(A) dims: " << res.dims(); + + framework::TensorCopy(res, context.GetPlace(), dslogdet); + dslogdet->Resize(input->dims()); + VLOG(3) << "dsl|A| dims: " << dslogdet->dims(); } }; diff --git a/python/paddle/fluid/tests/unittests/test_determinant_op.py b/python/paddle/fluid/tests/unittests/test_determinant_op.py index c19d44eb030cf..f8110bffa2f71 100644 --- a/python/paddle/fluid/tests/unittests/test_determinant_op.py +++ b/python/paddle/fluid/tests/unittests/test_determinant_op.py @@ -16,7 +16,7 @@ import unittest import numpy as np -from op_test import OpTest, skip_check_grad_ci +from op_test import OpTest import paddle import paddle.nn.functional as F import paddle.fluid as fluid @@ -26,7 +26,6 @@ paddle.enable_static() -@skip_check_grad_ci(reason="determinant grad is in progress.") class TestDeterminantOp(OpTest): def setUp(self): self.init_data() @@ -37,11 +36,11 @@ def test_check_output(self): self.check_output() def test_check_grad(self): - pass + self.check_grad(['Input'], ['Out']) def init_data(self): np.random.seed(0) - self.case = np.random.rand(3, 3, 3, 3, 3).astype('float64') + self.case = np.random.rand(3, 3, 3, 5, 5).astype('float64') self.inputs = {'Input': self.case} self.target = np.linalg.det(self.case) @@ -49,30 +48,25 @@ def init_data(self): class TestDeterminantOpCase1(TestDeterminantOp): def init_data(self): np.random.seed(0) - self.case = np.random.rand(3, 3, 3, 3).astype(np.float32) + self.case = np.random.rand(10, 10).astype('float32') self.inputs = {'Input': self.case} self.target = np.linalg.det(self.case) - def test_check_grad(self): - pass - class TestDeterminantOpCase2(TestDeterminantOp): def init_data(self): np.random.seed(0) - self.case = np.random.rand(4, 2, 4, 4).astype('float64') + # not invertible matrix + self.case = np.ones([4, 2, 4, 4]).astype('float64') self.inputs = {'Input': self.case} self.target = np.linalg.det(self.case) - def test_check_grad(self): - pass - class TestDeterminantAPI(unittest.TestCase): def setUp(self): - self.shape = [3, 3, 3, 3] np.random.seed(0) - self.x = np.random.rand(3, 3, 3, 3).astype(np.float32) + self.shape = [3, 3, 5, 5] + self.x = np.random.random(self.shape).astype(np.float32) self.place = paddle.CPUPlace() def test_api_static(self): @@ -96,7 +90,6 @@ def test_api_dygraph(self): paddle.enable_static() -@skip_check_grad_ci(reason="slogdeterminant grad is in progress.") class TestSlogDeterminantOp(OpTest): def setUp(self): self.op_type = "slogdeterminant" @@ -107,11 +100,12 @@ def test_check_output(self): self.check_output() def test_check_grad(self): - pass + # the slog det's grad value is always huge + self.check_grad(['Input'], ['Out'], max_relative_error=0.1) def init_data(self): np.random.seed(0) - self.case = np.random.rand(3, 3, 3, 3).astype('float64') + self.case = np.random.rand(4, 5, 5).astype('float64') self.inputs = {'Input': self.case} self.target = np.array(np.linalg.slogdet(self.case)) @@ -126,9 +120,9 @@ def init_data(self): class TestSlogDeterminantAPI(unittest.TestCase): def setUp(self): - self.shape = [3, 3, 3, 3] np.random.seed(0) - self.x = np.random.rand(3, 3, 3, 3).astype(np.float32) + self.shape = [3, 3, 5, 5] + self.x = np.random.random(self.shape).astype(np.float32) self.place = paddle.CPUPlace() def test_api_static(self):