Skip to content

Commit

Permalink
Merge branch 'develop' into develop_api20_bilinear
Browse files Browse the repository at this point in the history
  • Loading branch information
wanghuancoder authored Aug 20, 2020
2 parents 8a85a53 + f00f982 commit 7d3402d
Show file tree
Hide file tree
Showing 152 changed files with 9,467 additions and 1,695 deletions.
2 changes: 1 addition & 1 deletion paddle/fluid/framework/fleet/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,6 @@ else()
cc_library(gloo_wrapper SRCS gloo_wrapper.cc DEPS framework_proto variable_helper scope)
endif(WITH_GLOO)

cc_library(heter_wrapper SRCS heter_wrapper.cc DEPS framework_proto device_context)
cc_library(heter_wrapper SRCS heter_wrapper.cc DEPS framework_proto device_context heter_service_proto)

cc_test(test_fleet SRCS test_fleet.cc DEPS fleet_wrapper gloo_wrapper fs shell)
4 changes: 4 additions & 0 deletions paddle/fluid/framework/ir/conv_bn_fuse_pass.cc
Original file line number Diff line number Diff line change
Expand Up @@ -368,3 +368,7 @@ REGISTER_PASS(conv_transpose_bn_fuse_pass,
paddle::framework::ir::ConvTransposeBNFusePass);
REGISTER_PASS(conv_transpose_eltwiseadd_bn_fuse_pass,
paddle::framework::ir::ConvTransposeEltwiseAddBNFusePass);
REGISTER_PASS(depthwise_conv_bn_fuse_pass,
paddle::framework::ir::DepthwiseConvBNFusePass);
REGISTER_PASS(depthwise_conv_eltwiseadd_bn_fuse_pass,
paddle::framework::ir::DepthwiseConvEltwiseAddBNFusePass);
10 changes: 10 additions & 0 deletions paddle/fluid/framework/ir/conv_bn_fuse_pass.h
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,16 @@ class ConvTransposeEltwiseAddBNFusePass : public ConvEltwiseAddBNFusePass {
std::string conv_type() const { return "conv2d_transpose"; }
};

class DepthwiseConvBNFusePass : public ConvBNFusePass {
public:
std::string conv_type() const { return "depthwise_conv2d"; }
};

class DepthwiseConvEltwiseAddBNFusePass : public ConvEltwiseAddBNFusePass {
public:
std::string conv_type() const { return "depthwise_conv2d"; }
};

} // namespace ir
} // namespace framework
} // namespace paddle
3 changes: 2 additions & 1 deletion paddle/fluid/framework/ir/subgraph_detector.cc
Original file line number Diff line number Diff line change
Expand Up @@ -309,7 +309,8 @@ std::vector<std::vector<Node *>> SubgraphDetector::ExtractSubGraphs() {
BriefNode *brief_node = itr.second;

if (!Agent(brief_node->node).marked()) {
VLOG(4) << brief_node->node->id() << " node not a trt candidate.";
VLOG(4) << brief_node->node->id() << " node named "
<< brief_node->node->Name() << " is not a trt candidate.";
continue;
}

Expand Down
17 changes: 17 additions & 0 deletions paddle/fluid/framework/prune.cc
Original file line number Diff line number Diff line change
Expand Up @@ -210,6 +210,23 @@ void prune_impl(const proto::ProgramDesc& input, proto::ProgramDesc* output,
should_run.push_back(true);
} else {
should_run.push_back(false);
// If the output of an op modifies feed vars, the op should not clip.
// For example, in the transformer structure, the third parameter returned
// by beam_search op is generally assigned to a feed var. Cutting the
// assign op will cause an error.
if (parent_block_id != -1) {
bool flag = false;
for (auto& var : op_desc.outputs()) {
for (auto& argu : var.arguments()) {
if (feed_var_names.count(argu)) {
flag = true;
}
}
}
if (flag) {
should_run.back() = true;
}
}
}
}

Expand Down
31 changes: 31 additions & 0 deletions paddle/fluid/framework/prune_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -185,3 +185,34 @@ TEST(Prune, recurrrent_op) {
EXPECT_EQ(pruned.blocks(0).ops_size(), 2);
EXPECT_EQ(pruned.blocks(1).ops_size(), 1);
}

// If the output of an op modifies feed vars, the op should not clip.
TEST(Prune, recurrrent_op_2) {
f::ProgramDesc program;
f::BlockDesc *block = program.MutableBlock(0);
f::BlockDesc *sub_block = program.AppendBlock(*block);
AddOp("one_two", {{"input", {"a"}}}, {{"output", {"b", "c"}}},
f::AttributeMap{}, block);

std::vector<std::string> state_var_name(1, "y");
AddOp("recurrent", {{"input", {"b", "c"}}}, {{"output", {"b1, c1"}}},
{{"ex_states", state_var_name},
{"states", state_var_name},
{"sub_block", sub_block}},
block);

EXPECT_TRUE(sub_block != nullptr);
AddOp("rnn_memory_helper", {{"input", {"x"}}}, {{"output", {"a"}}},
f::AttributeMap{}, sub_block);

f::proto::ProgramDesc *pdesc = program.Proto();
pdesc->mutable_blocks(0)->mutable_ops(1)->set_is_target(true);

f::proto::ProgramDesc pruned;
std::set<std::string> feed_var_names = {"x", "a"};

f::Prune(*pdesc, feed_var_names, &pruned);
EXPECT_EQ(pruned.blocks_size(), 2);
EXPECT_EQ(pruned.blocks(0).ops_size(), 2);
EXPECT_EQ(pruned.blocks(1).ops_size(), 1);
}
7 changes: 6 additions & 1 deletion paddle/fluid/inference/tensorrt/engine.h
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,12 @@ nvinfer1::Dims Vec2TRT_Dims(const std::vector<T>& shape, std::string input,
} else if (shape.size() == 3UL) {
return nvinfer1::Dims3(shape[0], shape[1], shape[2]);
}
return nvinfer1::Dims4(shape[0], shape[1], 1, 1);
nvinfer1::Dims dims;
dims.nbDims = shape.size();
for (size_t i = 0; i < shape.size(); i++) {
dims.d[i] = shape[i];
}
return dims;
}
}
} // NOLINT
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,16 @@ nvinfer1::DimsExprs EmbEltwiseLayernormPluginDynamic<T>::getOutputDimensions(
return ret;
}

template <typename T>
void EmbEltwiseLayernormPluginDynamic<T>::terminate() {
for (auto ptr : embs_gpu_) {
if (ptr) cudaFree(ptr);
}

if (bias_gpu_) cudaFree(bias_gpu_);
if (scale_gpu_) cudaFree(scale_gpu_);
}

template <typename T>
bool EmbEltwiseLayernormPluginDynamic<T>::supportsFormatCombination(
int pos, const nvinfer1::PluginTensorDesc *in_out, int nb_inputs,
Expand Down Expand Up @@ -153,7 +163,7 @@ int EmbEltwiseLayernormPluginDynamic<T>::enqueue(
int64_t *emb_ptr_gpu_d =
emb_ptr_tensor.mutable_data<int64_t>(platform::CUDAPlace(device_id));

std::vector<int64_t> in_ptr, emb_ptr;
std::vector<uintptr_t> in_ptr, emb_ptr;
for (int i = 0; i < input_num; i++) {
in_ptr.push_back(reinterpret_cast<uintptr_t>(inputs[i]));
emb_ptr.push_back(reinterpret_cast<uintptr_t>(embs_gpu_[i]));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -81,9 +81,13 @@ class EmbEltwiseLayernormPluginDynamic : public DynamicPluginTensorRT {
}

nvinfer1::IPluginV2DynamicExt* clone() const override {
return new EmbEltwiseLayernormPluginDynamic(
auto ptr = new EmbEltwiseLayernormPluginDynamic(
embs_, bias_, scale_, emb_sizes_, bias_size_, scale_size_, hidden_size_,
eps_);
ptr->embs_gpu_ = embs_gpu_;
ptr->bias_gpu_ = bias_gpu_;
ptr->scale_gpu_ = scale_gpu_;
return ptr;
}

const char* getPluginType() const override {
Expand Down Expand Up @@ -111,6 +115,7 @@ class EmbEltwiseLayernormPluginDynamic : public DynamicPluginTensorRT {
return sum_num;
}

void terminate() override;
void serialize(void* buffer) const override {
// SerializeValue(&buffer, with_fp16_);
SerializeValue(&buffer, emb_sizes_);
Expand Down
6 changes: 6 additions & 0 deletions paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.cu
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,12 @@ int PReluPlugin::enqueue(int batch_size, const void *const *inputs,

#if IS_TRT_VERSION_GE(6000)

void PReluPluginDynamic::terminate() {
if (p_gpu_weight_) {
cudaFree(p_gpu_weight_);
}
}

int PReluPluginDynamic::initialize() {
cudaMalloc(&p_gpu_weight_, sizeof(float) * weight_.size());
cudaMemcpy(p_gpu_weight_, weight_.data(), weight_.size() * sizeof(float),
Expand Down
5 changes: 4 additions & 1 deletion paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.h
Original file line number Diff line number Diff line change
Expand Up @@ -102,12 +102,15 @@ class PReluPluginDynamic : public DynamicPluginTensorRT {
}
~PReluPluginDynamic() { cudaFree(p_gpu_weight_); }
nvinfer1::IPluginV2DynamicExt* clone() const override {
return new PReluPluginDynamic(weight_.data(), weight_.size(), mode_);
auto ptr = new PReluPluginDynamic(weight_.data(), weight_.size(), mode_);
ptr->p_gpu_weight_ = p_gpu_weight_;
return ptr;
}

const char* getPluginType() const override { return "prelu_plugin"; }
int getNbOutputs() const override { return 1; }
int initialize() override;
void terminate() override;

size_t getSerializationSize() const override;
void serialize(void* buffer) const override;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -51,8 +51,11 @@ class SkipLayerNormPluginDynamic : public DynamicPluginTensorRT {
}

nvinfer1::IPluginV2DynamicExt* clone() const override {
return new SkipLayerNormPluginDynamic(
auto ptr = new SkipLayerNormPluginDynamic(
bias_.data(), scale_.data(), bias_size_, scale_size_, eps_, ban_fp16_);
ptr->bias_gpu_ = bias_gpu_;
ptr->scale_gpu_ = bias_gpu_;
return ptr;
}

const char* getPluginType() const override { return "skip_layernorm_plugin"; }
Expand Down
11 changes: 1 addition & 10 deletions paddle/fluid/inference/tests/api/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -471,19 +471,10 @@ if(WITH_GPU AND TENSORRT_FOUND)
inference_download_and_uncompress(${TEST_TRT_ERNIE_MODEL} ${INFERENCE_URL}/tensorrt_test "ernie_model_4_unserialized.tgz")
endif()

inference_analysis_test(test_trt_dynamic_shape_ernie_serialize SRCS trt_dynamic_shape_ernie_deserialize_test.cc
inference_analysis_test(test_trt_dynamic_shape_ernie_ser_deser SRCS trt_dynamic_shape_ernie_deserialize_test.cc
EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
ARGS --infer_model=${TEST_TRT_ERNIE_MODEL}/ernie_model_4_unserialized)

set(TEST_TRT_ERNIE_SER_MODEL "${TRT_MODEL_INSTALL_DIR}/ernie_test/ernie_model_4_serialized/")
if (NOT EXISTS ${TEST_TRT_ERNIE_SER_MODEL})
inference_download_and_uncompress(${TEST_TRT_ERNIE_MODEL} ${INFERENCE_URL}/tensorrt_test "ernie_model_4_serialized.tgz")
endif()

inference_analysis_test(test_trt_dynamic_shape_ernie_deserialize SRCS trt_dynamic_shape_ernie_deserialize_test.cc
EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
ARGS --infer_model=${TEST_TRT_ERNIE_MODEL}/ernie_model_4_serialized)

endif()

set(LITE_MODEL_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/lite")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -123,8 +123,11 @@ void trt_ernie(bool with_fp16, std::vector<float> result) {
config.EnableTensorRtEngine(1 << 30, 1, 5, precision, true, false);
config.SetTRTDynamicShapeInfo(min_input_shape, max_input_shape,
opt_input_shape);
AnalysisConfig* config_deser = new AnalysisConfig(config);

std::vector<float> out_data;
run(config, &out_data);
run(config, &out_data); // serialize
run(*config_deser, &out_data); // deserialize
for (size_t i = 0; i < out_data.size(); i++) {
EXPECT_NEAR(result[i], out_data[i], 1e-6);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -126,7 +126,7 @@ void trt_ernie(bool with_fp16, std::vector<float> result) {
std::vector<float> out_data;
run(config, &out_data);
for (size_t i = 0; i < out_data.size(); i++) {
EXPECT_NEAR(result[i], out_data[i], 1e-6);
EXPECT_NEAR(result[i], out_data[i], 1e-5);
}
}

Expand Down
15 changes: 9 additions & 6 deletions paddle/fluid/operators/allclose_op.cc
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,11 @@ namespace operators {
class AllcloseOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("Input", "The first input tensor to compare.");
AddInput("Other", "The second input tensor to compare.");
AddOutput("Out", "The output tensor of allclose op.");
AddInput("Input",
"The input tensor, it's data type should be float32, float64.");
AddInput("Other",
"The input tensor, it's data type should be float32, float64.");
AddOutput("Out", "The output tensor, it's data type is bool.");

AddAttr<float>("rtol", "The relative tolerance. Default: :math:`1e-5` .")
.SetDefault(1e-5);
Expand All @@ -36,11 +38,12 @@ class AllcloseOpMaker : public framework::OpProtoAndCheckerMaker {
.SetDefault(false);

AddComment(R"DOC(
This operator checks if all :math:`input` and :math:`other` satisfy the condition:
This operator checks if all :math:`x` and :math:`y` satisfy the condition:
:math:`\left| input - other \right| \leq atol + rtol \times \left| other \right|`
.. math::
\left| x - y \right| \leq atol + rtol \times \left| y \right|
elementwise, for all elements of :math:`input` and :math:`other`. The behaviour of this
elementwise, for all elements of :math:`x` and :math:`y`. The behaviour of this
operator is analogous to :math:`numpy.allclose`, namely that it returns :math:`True` if
two tensors are elementwise equal within a tolerance.
)DOC");
Expand Down
51 changes: 22 additions & 29 deletions paddle/fluid/operators/arg_max_op.cu
Original file line number Diff line number Diff line change
@@ -1,29 +1,22 @@
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/operators/arg_min_max_op_base.h"

REGISTER_OP_CUDA_KERNEL(
arg_max,
paddle::operators::ArgMaxKernel<paddle::platform::CUDADeviceContext, float>,
paddle::operators::ArgMaxKernel<paddle::platform::CUDADeviceContext,
double>,
paddle::operators::ArgMaxKernel<paddle::platform::CUDADeviceContext,
int64_t>,
paddle::operators::ArgMaxKernel<paddle::platform::CUDADeviceContext,
int32_t>,
paddle::operators::ArgMaxKernel<paddle::platform::CUDADeviceContext,
int16_t>,
paddle::operators::ArgMaxKernel<paddle::platform::CUDADeviceContext,
uint8_t>);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/operators/arg_min_max_op_base.cu.h"

REGISTER_OP_CUDA_KERNEL(
arg_max, paddle::operators::ArgMinMaxOpCUDAKernel<float, cub::ArgMax>,
paddle::operators::ArgMinMaxOpCUDAKernel<double, cub::ArgMax>,
paddle::operators::ArgMinMaxOpCUDAKernel<int64_t, cub::ArgMax>,
paddle::operators::ArgMinMaxOpCUDAKernel<int32_t, cub::ArgMax>,
paddle::operators::ArgMinMaxOpCUDAKernel<int8_t, cub::ArgMax>);
Loading

0 comments on commit 7d3402d

Please sign in to comment.