PaddlePaddle · luotao1 · Apr 12, 2023 · Apr 4, 2023 · Apr 4, 2023 · Apr 10, 2023
diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -2566,6 +2566,7 @@ USE_TRT_CONVERTER(expand_v2)
 USE_TRT_CONVERTER(take_along_axis)
 USE_TRT_CONVERTER(skip_groupnorm_act)
 USE_TRT_CONVERTER(preln_groupnorm_act)
+USE_TRT_CONVERTER(cumsum)
 #if IS_TRT_VERSION_GE(8522)
 USE_TRT_CONVERTER(flash_multihead_matmul)
 USE_TRT_CONVERTER(cross_multihead_matmul)

diff --git a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
@@ -105,6 +105,7 @@ list(
   skip_groupnorm_act_op.cc
   preln_groupnorm_act_op.cc
   expand_v2_op.cc
+  cumsum_op.cc
   temporal_shift_op.cc)
 
 if(${TENSORRT_MAJOR_VERSION} GREATER_EQUAL 7)

diff --git a/paddle/fluid/inference/tensorrt/convert/cumsum_op.cc b/paddle/fluid/inference/tensorrt/convert/cumsum_op.cc
@@ -0,0 +1,156 @@
+/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+
+/*
+ * Cumsum Op: inclusive running sum along `axis`, built from a TensorRT loop
+ * (iterator over `axis` + recurrence + concatenated loop output).
+ */
+class CumsumOpConverter : public OpConverter {
+ public:
+  // Uses input "X" and int attr "axis" (0 if absent, negatives wrap by rank).
+  // NOTE(review): no other attr is read; confirm exclusive/reverse are unset.
+  void operator()(const framework::proto::OpDesc& op,
+                  const framework::Scope& scope,
+                  bool test_mode) override {
+#if IS_TRT_VERSION_GE(7220)
+    VLOG(3) << "convert a cumsum op to tensorrt layer";
+    framework::OpDesc op_desc(op, nullptr);
+    std::string input_x_name = op_desc.Input("X").front();
+    std::string output_name = op_desc.Output("Out").front();
+    auto* input_x_tensor = engine_->GetITensor(input_x_name);
+    auto dims = input_x_tensor->getDimensions();
+    auto rank = dims.nbDims;
+    int axis = 0;
+    if (op_desc.HasAttr("axis")) {
+      axis = PADDLE_GET_CONST(int, op_desc.GetAttr("axis"));
+      if (axis < 0) {
+        axis += rank;
+      }
+    }
+
+    // Length of dimension `axis` of `inpTensor`: a scalar by default; callers
+    // pass scalar = false when the result is fed to Concat below.
+    auto getAxisLength =
+        [&](nvinfer1::ITensor* inpTensor, int axis, bool scalar = true) {
+          auto dims = inpTensor->getDimensions();
+          int d = dims.d[axis];
+          if (d >= 0) {
+            return Add1DConstantLayer(d, "", scalar);
+          } else {
+            // `d` is negative here, so it cannot be used as the index into
+            // the shape tensor; the element to read is the one at `axis`.
+            nvinfer1::ITensor* inpShape = Shape(inpTensor);
+            return GetEleTensorOfShape(inpShape, axis, scalar);
+          }
+        };
+
+    // Create "inputSliced": the input sliced to length 1 on dimension `axis`.
+    // Static start/size/stride are fully initialized placeholders; the shape
+    // tensors attached through setInput(1..3) below carry the real values.
+    nvinfer1::Dims start;
+    start.nbDims = rank;
+    std::vector<int32_t> start_vec(rank, 0);
+    std::fill(start.d, start.d + rank, 0);
+    nvinfer1::Dims size;
+    size.nbDims = rank;
+    std::fill(size.d, size.d + rank, 1);
+    nvinfer1::Dims stride;
+    stride.nbDims = rank;
+    std::fill(stride.d, stride.d + rank, 1);
+    auto axisLength = getAxisLength(input_x_tensor, axis, false);
+
+    auto starts_tensor =
+        Add1DConstantLayer(start_vec, output_name + "_start_tensor_");
+    auto sizes_tensor = axis == 0 ? Add1DConstantLayer(1)
+                                  : getAxisLength(input_x_tensor, 0, false);
+    auto strides_tensor = axis == 0 ? axisLength : Add1DConstantLayer(1);
+
+    // Append one entry per remaining dim: on `axis` size 1 with a stride of
+    // the axis length, elsewhere the full extent with stride 1.
+    for (int i = 1; i < rank; i++) {
+      const bool on_axis = (i == axis);
+      strides_tensor = Concat(std::vector<nvinfer1::ITensor*>{
+          strides_tensor, on_axis ? axisLength : Add1DConstantLayer(1)});
+      sizes_tensor = Concat(std::vector<nvinfer1::ITensor*>{
+          sizes_tensor,
+          on_axis ? Add1DConstantLayer(1)
+                  : getAxisLength(input_x_tensor, i, false)});
+    }
+
+    auto inputSliced = TRT_ENGINE_ADD_LAYER(
+        engine_, Slice, *input_x_tensor, start, size, stride);
+    inputSliced->setInput(1, *starts_tensor);
+    inputSliced->setInput(2, *sizes_tensor);
+    inputSliced->setInput(3, *strides_tensor);
+    auto inputSliced_output = inputSliced->getOutput(0);
+
+    // Scan through each slice across axis and add it to the running sum
+    auto loop = TRT_ENGINE_ADD_LAYER(engine_, Loop);
+    nvinfer1::ITensor* tripLimit = getAxisLength(input_x_tensor, axis);
+    loop->addTripLimit(*tripLimit, nvinfer1::TripLimit::kCOUNT);
+    auto iterator = loop->addIterator(*input_x_tensor, axis);
+    auto data = iterator->getOutput(0);
+
+    // Squeeze inputSliced down to same shape as `data`
+    auto sliced_dims = inputSliced_output->getDimensions();
+    std::vector<int32_t> subscripts(sliced_dims.nbDims);
+    std::iota(subscripts.begin(), subscripts.end(), 0);
+    auto p = std::remove_if(subscripts.begin(),
+                            subscripts.end(),
+                            [axis](int x) { return x == axis; });
+    subscripts.resize(p - subscripts.begin());
+    auto newDims = Gather(Shape(inputSliced_output), subscripts);
+    inputSliced_output = Reshape(inputSliced_output, newDims);
+
+    // Create the initial running sum: 0 * slice, so it has the slice's shape.
+    // NOTE(review): the zero constant is float; confirm non-float inputs are
+    // rejected upstream or add a cast here.
+    std::vector<float> zero_vec{0.f};
+    auto zero = Add1DConstantLayer(zero_vec);
+    zero = TRT_ENGINE_ADD_LAYER(engine_,
+                                ElementWise,
+                                *inputSliced_output,
+                                *BroadcastTensors(zero, inputSliced_output),
+                                nvinfer1::ElementWiseOperation::kPROD)
+               ->getOutput(0);
+    auto runningSum = loop->addRecurrence(*zero);
+    auto runningSumTensor = runningSum->getOutput(0);
+    auto curSum = TRT_ENGINE_ADD_LAYER(engine_,
+                                       ElementWise,
+                                       *data,
+                                       *runningSumTensor,
+                                       nvinfer1::ElementWiseOperation::kSUM);
+    runningSum->setInput(1, *curSum->getOutput(0));
+    auto reverseFlag = nvinfer1::LoopOutput::kCONCATENATE;
+    nvinfer1::ILoopOutputLayer* loopOut =
+        loop->addLoopOutput(*curSum->getOutput(0), reverseFlag, axis);
+    loopOut->setInput(1, *tripLimit);
+    RreplenishLayerAndOutput(loopOut, "cumsum", {output_name}, test_mode);
+#else
+    VLOG(3) << "Cumsum is not supported when TensorRT < 7.2.2";
+#endif
+  }
+};
+
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
+
+REGISTER_TRT_OP_CONVERTER(cumsum, CumsumOpConverter);
diff --git a/paddle/fluid/inference/tensorrt/convert/op_converter.h b/paddle/fluid/inference/tensorrt/convert/op_converter.h
@@ -416,6 +416,52 @@ class OpConverter {
     return TRT_ENGINE_ADD_LAYER(engine_, Shape, *input)->getOutput(0);
   }
 
+  // Reshapes `input` to the shape held by the shape tensor `newShape` and
+  // returns the Shuffle layer's output. There is no identity shortcut: a
+  // tensor obtained from Shape(input) is always a new ITensor, so comparing
+  // its pointer with `newShape` can never match.
+  nvinfer1::ITensor* Reshape(nvinfer1::ITensor* input,
+                             nvinfer1::ITensor* newShape) {
+    auto* shuffle = TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *input);
+    shuffle->setInput(1, *newShape);
+    return shuffle->getOutput(0);
+  }
+
+  // Left-pads the shape of `input` with 1s until it has `nbDims` dimensions.
+  // Returns `input` unchanged when it already has `nbDims` dimensions and
+  // throws InvalidArgument when it has more.
+  nvinfer1::ITensor* BroadcastTensor(nvinfer1::ITensor* input,
+                                     const int nbDims) {
+    // The rank is the number of dims of `input` itself. Shape(input) is a 1-D
+    // shape tensor, so its own nbDims would be 1 for every input.
+    const int rank = input->getDimensions().nbDims;
+    if (rank > nbDims) {
+      PADDLE_THROW(platform::errors::InvalidArgument(
+          "Cannot broadcast a higher rank tensor to a lower rank tensor."));
+    }
+    if (rank < nbDims) {
+      // New shape = (nbDims - rank) leading 1s followed by the old shape.
+      auto* one_rank_tensor =
+          Add1DConstantLayer(std::vector<int32_t>(nbDims - rank, 1));
+      std::vector<nvinfer1::ITensor*> itensors{one_rank_tensor, Shape(input)};
+      input = Reshape(input, Concat(itensors));
+    }
+    return input;
+  }
+
+  // Returns the lower-rank tensor of the pair after BroadcastTensor pads it to
+  // the other's rank; with equal ranks this is BroadcastTensor(a, rank of b).
+  nvinfer1::ITensor* BroadcastTensors(nvinfer1::ITensor* a,
+                                      nvinfer1::ITensor* b) {
+    const int rank_a = a->getDimensions().nbDims;
+    const int rank_b = b->getDimensions().nbDims;
+    if (rank_a == rank_b) {
+      VLOG(3) << "Broadcast two equal rank tensors";
+    }
+    return rank_a > rank_b ? BroadcastTensor(b, rank_a)
+                           : BroadcastTensor(a, rank_b);
+  }
+
   // Concat not make rank changed
   nvinfer1::ITensor* Concat(const std::vector<nvinfer1::ITensor*>& inputs,
                             int axis = 0) {

diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc
@@ -2703,6 +2703,25 @@ struct SimpleOpTypeSetTeller : public Teller {
 #endif
     }
 
+    // cumsum: needs TensorRT >= 7.2.2, dynamic shape and a non-null block desc.
+    if (op_type == "cumsum") {
+#if !IS_TRT_VERSION_GE(7220)
+      VLOG(3) << "cumsum is not supported when TensorRT < 7.2.2";
+      return false;
+#endif
+      if (!with_dynamic_shape) {
+        VLOG(3) << "the cumsum does not support "
+                   "static shape yet";
+        return false;
+      }
+      if (desc.Block() == nullptr) {
+        VLOG(3) << "The block desc is nullptr, we can't continue to analyze. "
+                   "Developers need to check whether block_desc is passed in "
+                   "the pass.";
+        return false;
+      }
+    }
+
     if (op_type == "temporal_shift") {
 #if !IS_TRT_VERSION_GE(8200)
       VLOG(3) << "temporal_shift is not supported when TensorRT < 8.2";
@@ -2904,7 +2923,8 @@ struct SimpleOpTypeSetTeller : public Teller {
       "skip_groupnorm_act",
       "preln_groupnorm_act",
       "temporal_shift",
-      "grid_sampler"};
+      "grid_sampler",
+      "cumsum"};
 
   std::unordered_set<std::string> teller_set{
       "mul",
@@ -3062,7 +3082,8 @@ struct SimpleOpTypeSetTeller : public Teller {
       "skip_groupnorm_act",
       "preln_groupnorm_act",
       "temporal_shift",
-      "grid_sampler"};
+      "grid_sampler",
+      "cumsum"};
 };
 
 struct GenericPluginTeller : public Teller {