cyyever
diff --git a/‎dali/kernels/imgproc/convolution/laplacian_gpu.cuh
+14-2 b/‎dali/kernels/imgproc/convolution/laplacian_gpu.cuh
+14-2
diff --git a/‎dali/kernels/imgproc/convolution/laplacian_windows.h
+125 b/‎dali/kernels/imgproc/convolution/laplacian_windows.h
+125
diff --git a/‎dali/kernels/imgproc/convolution/laplacian_windows_test.cc
+76 b/‎dali/kernels/imgproc/convolution/laplacian_windows_test.cc
+76
diff --git a/‎dali/operators/image/convolution/CMakeLists.txt
+2-1 b/‎dali/operators/image/convolution/CMakeLists.txt
+2-1
diff --git a/‎dali/operators/image/convolution/laplacian.cc
+19-10 b/‎dali/operators/image/convolution/laplacian.cc
+19-10
@@ -40,8 +40,8 @@ namespace laplacian {
  * @brief Computes convolution to obtain partial derivative in one of the dimensions.
  * Convolution consits of `axes` windows, each to convolve along one dimension of the input data,
  * where `deriv_axis`-th window is supposed to compute partial derivative along that axis,
- * whereas the remaining windows should perform smoothing. If no smoothing is necessary in a whole
- * batch, you can prevent smoothing convolutions form running by passing empty lists for
+ * whereas the remaining windows should perform smoothing. If no smoothing is necessary in
+ * the whole batch, you can prevent smoothing convolutions from running by passing empty lists for
  * `window_sizes[i]` such that `i != deriv_axis`.
  */
 template <typename Out, typename In, typename W, int axes, int deriv_axis, bool has_channels,
@@ -61,6 +61,18 @@ struct PartialDerivGpu {
     return false;
   }
 
+  /**
+   * @param ctx             Kernel context, used for scratch-pad.
+   * @param in_shape        List of input shapes, used by underlying convolution kernels to infer
+   *                        intermediate buffer sizes.
+   * @param window_sizes    For given `i`, `window_sizes[i]` contains per-sample window sizes
+   *                        to be applied in a convolution along `i-th` axis. The length of
+   *                        `window_sizes[deriv_axis]` must be equal to the input batch size.
+   *                        Lists for other axes must either all have length equal to the input
+   *                        batch size or all be empty. In the latter case, smoothing convolutions
+   *                        will be omitted, i.e. only one convolution, along `deriv_axis`
+   *                        will be applied.
+   */
   KernelRequirements Setup(KernelContext& ctx, const TensorListShape<ndim>& in_shape,
                            const std::array<TensorListShape<1>, axes>& window_sizes) {
     has_smoothing_ = HasSmoothing(window_sizes);
 
@@ -0,0 +1,125 @@
+// Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef DALI_KERNELS_IMGPROC_CONVOLUTION_LAPLACIAN_WINDOWS_H_
+#define DALI_KERNELS_IMGPROC_CONVOLUTION_LAPLACIAN_WINDOWS_H_
+
+#include <cassert>
+#include <vector>
+
+#include "dali/core/tensor_view.h"
+
+namespace dali {
+namespace kernels {
+
+/**
+ * @brief Lazily computes and caches unnormalized 1D smoothing and second-derivative windows
+ * of odd sizes `1, 3, ..., max_window_size`.
+ *
+ * All windows of a given kind are stored back to back in a single buffer; the window of size
+ * `2 * i + 1` is exposed through the `i`-th view. A window is computed on first request and
+ * reused afterwards. The views returned by the getters point into memory owned by this object,
+ * so they must not outlive it.
+ */
+template <typename T>
+class LaplacianWindows {
+ public:
+  /**
+   * @param max_window_size  The biggest window size that can be requested; must be odd
+   *                         and positive.
+   */
+  explicit LaplacianWindows(int max_window_size) : smooth_computed_{1}, deriv_computed_{1} {
+    Resize(max_window_size);
+    // Windows of size 1 are the starting point of both recurrences.
+    *smoothing_views_[0](0) = 1;
+    *deriv_views_[0](0) = 1;
+  }
+
+  /**
+   * @brief Copies the computed coefficients and rebinds the views to the copy's own buffers.
+   * A member-wise copy would leave the views pointing into `other`'s memory.
+   */
+  LaplacianWindows(const LaplacianWindows &other)
+      : smooth_computed_{other.smooth_computed_},
+        deriv_computed_{other.deriv_computed_},
+        max_window_size_{other.max_window_size_},
+        smoothing_memory_{other.smoothing_memory_},
+        deriv_memory_{other.deriv_memory_} {
+    BindViews();
+  }
+
+  LaplacianWindows &operator=(const LaplacianWindows &other) {
+    if (this != &other) {
+      smooth_computed_ = other.smooth_computed_;
+      deriv_computed_ = other.deriv_computed_;
+      max_window_size_ = other.max_window_size_;
+      smoothing_memory_ = other.smoothing_memory_;
+      deriv_memory_ = other.deriv_memory_;
+      // Assignment may have reallocated the buffers, so the views have to be rebuilt.
+      BindViews();
+    }
+    return *this;
+  }
+
+  // Moving the vectors hands over their buffers, so the moved views stay valid.
+  LaplacianWindows(LaplacianWindows &&) = default;
+  LaplacianWindows &operator=(LaplacianWindows &&) = default;
+
+  /**
+   * @brief Returns the second-derivative window of the given size, computing it if needed.
+   *
+   * @param window_size  Odd size in the range `[1, max_window_size]`.
+   */
+  TensorView<StorageCPU, const T, 1> GetDerivWindow(int window_size) {
+    assert(1 <= window_size && window_size <= max_window_size_);
+    assert(window_size % 2 == 1);
+    auto window_idx = window_size / 2;
+    // The derivative recurrence starts from smoothing windows of sizes up to window_size - 2.
+    PrepareSmoothingWindow(window_size - 2);
+    PrepareDerivWindow(window_size);
+    return deriv_views_[window_idx];
+  }
+
+  /**
+   * @brief Returns the smoothing window of the given size, computing it if needed.
+   *
+   * @param window_size  Odd size in the range `[1, max_window_size]`.
+   */
+  TensorView<StorageCPU, const T, 1> GetSmoothingWindow(int window_size) {
+    assert(1 <= window_size && window_size <= max_window_size_);
+    assert(window_size % 2 == 1);
+    auto window_idx = window_size / 2;
+    PrepareSmoothingWindow(window_size);
+    return smoothing_views_[window_idx];
+  }
+
+ private:
+  /**
+   * @brief Smoothing window of size 2n + 1 is [1, 2, 1] conv composed with itself n - 1 times
+   * so that the window has appropriate size: it boils down to computing binomial coefficients:
+   * (1 + 1) ^ (2n).
+   *
+   * Each iteration convolves the previous row with [1, 1]. Rows of even size are intermediate
+   * results kept in the storage of the next odd-sized window, which is then updated in place.
+   */
+  void PrepareSmoothingWindow(int window_size) {
+    for (; smooth_computed_ < window_size; smooth_computed_++) {
+      auto cur_size = smooth_computed_ + 1;
+      auto cur_idx = cur_size / 2;
+      auto &prev_view = smoothing_views_[cur_size % 2 == 0 ? cur_idx - 1 : cur_idx];
+      auto &view = smoothing_views_[cur_idx];
+      auto prev_val = *prev_view(0);
+      *view(0) = prev_val;
+      for (int j = 1; j < cur_size - 1; j++) {
+        // `prev_view` may alias `view`, so the old value is read before it is overwritten.
+        auto val = *prev_view(j);
+        *view(j) = prev_val + val;
+        prev_val = val;
+      }
+      *view(cur_size - 1) = prev_val;
+    }
+  }
+
+  /**
+   * @brief Derivative window of size 3 is [1, -2, 1] (which is [1, -1] composed with itself).
+   * Bigger windows are convolutions of smoothing windows with [1, -2, 1].
+   *
+   * Each iteration convolves with [-1, 1]: an even-sized row is derived from the smoothing
+   * window of size `cur_size - 1` (which must already be computed), and the following odd-sized
+   * row is obtained from that intermediate row in place.
+   */
+  void PrepareDerivWindow(int window_size) {
+    for (; deriv_computed_ < window_size; deriv_computed_++) {
+      auto cur_size = deriv_computed_ + 1;
+      auto cur_idx = cur_size / 2;
+      auto &prev_view = cur_size % 2 == 0 ? smoothing_views_[cur_idx - 1] : deriv_views_[cur_idx];
+      auto &view = deriv_views_[cur_idx];
+      auto prev_val = *prev_view(0);
+      *view(0) = -prev_val;
+      for (int j = 1; j < cur_size - 1; j++) {
+        // `prev_view` may alias `view`, so the old value is read before it is overwritten.
+        auto val = *prev_view(j);
+        *view(j) = prev_val - val;
+        prev_val = val;
+      }
+      *view(cur_size - 1) = prev_val;
+    }
+  }
+
+  /**
+   * @brief Allocates storage for all odd-sized windows up to `max_window_size` and sets up
+   * the views.
+   */
+  void Resize(int max_window_size) {
+    assert(1 <= max_window_size && max_window_size % 2 == 1);
+    max_window_size_ = max_window_size;
+    int num_windows = (max_window_size + 1) / 2;
+    // 1 + 3 + ... + (2 * num_windows - 1) == num_windows * num_windows
+    int num_elements = num_windows * num_windows;
+    smoothing_memory_.resize(num_elements);
+    deriv_memory_.resize(num_elements);
+    BindViews();
+  }
+
+  /**
+   * @brief Points the i-th views at the consecutive chunks of size `2 * i + 1` of the buffers
+   * currently owned by this object.
+   */
+  void BindViews() {
+    int num_windows = (max_window_size_ + 1) / 2;
+    smoothing_views_.resize(num_windows);
+    deriv_views_.resize(num_windows);
+    int offset = 0;
+    int window_size = 1;
+    for (int i = 0; i < num_windows; i++) {
+      smoothing_views_[i] = {&smoothing_memory_[offset], {window_size}};
+      deriv_views_[i] = {&deriv_memory_[offset], {window_size}};
+      offset += window_size;
+      window_size += 2;
+    }
+  }
+
+  // Sizes of the most recently computed rows; rows of all smaller sizes are ready to use.
+  int smooth_computed_, deriv_computed_;
+  int max_window_size_;
+  std::vector<T> smoothing_memory_;
+  std::vector<T> deriv_memory_;
+  std::vector<TensorView<StorageCPU, T, 1>> smoothing_views_;
+  std::vector<TensorView<StorageCPU, T, 1>> deriv_views_;
+};
+
+}  // namespace kernels
+}  // namespace dali
+
+#endif  // DALI_KERNELS_IMGPROC_CONVOLUTION_LAPLACIAN_WINDOWS_H_
@@ -0,0 +1,76 @@
+// Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <gtest/gtest.h>
+#include <cmath>
+#include <opencv2/imgproc.hpp>
+
+#include "dali/kernels/common/utils.h"
+#include "dali/test/tensor_test_utils.h"
+#include "dali/test/test_tensors.h"
+
+#include "dali/kernels/imgproc/convolution/laplacian_windows.h"
+
+namespace dali {
+namespace kernels {
+
+/**
+ * @brief Compares the derivative window of the given size against the kernel produced by
+ * `cv::getDerivKernels`. The window under test is rescaled by 2^(3 - window_size)
+ * before the element-wise comparison.
+ */
+void CheckDerivWindow(int window_size, LaplacianWindows<float> &windows) {
+  cv::Mat deriv_ref, smooth_ref;
+  cv::getDerivKernels(deriv_ref, smooth_ref, 2, 0, window_size, true, CV_32F);
+  const auto view = windows.GetDerivWindow(window_size);
+  const float *actual = view.data;
+  const float scale = std::exp2f(3 - window_size);
+  for (int pos = 0; pos < window_size; pos++) {
+    const float expected = deriv_ref.at<float>(pos);
+    EXPECT_NEAR(actual[pos] * scale, expected, 1e-6f)
+        << "window_size: " << window_size << ", position: " << pos;
+  }
+}
+
+/**
+ * @brief Compares the smoothing window of the given size against the kernel produced by
+ * `cv::getDerivKernels`. The window under test is rescaled by 2^(1 - window_size)
+ * before the element-wise comparison.
+ */
+void CheckSmoothingWindow(int window_size, LaplacianWindows<float> &windows) {
+  cv::Mat deriv_ref, smooth_ref;
+  cv::getDerivKernels(deriv_ref, smooth_ref, 2, 0, window_size, true, CV_32F);
+  const auto view = windows.GetSmoothingWindow(window_size);
+  const float *actual = view.data;
+  const float scale = std::exp2f(1 - window_size);
+  for (int pos = 0; pos < window_size; pos++) {
+    const float expected = smooth_ref.at<float>(pos);
+    EXPECT_NEAR(actual[pos] * scale, expected, 1e-6f)
+        << "window_size: " << window_size << ", position: " << pos;
+  }
+}
+
+// Requests derivative windows of odd sizes in ascending order and checks each one.
+TEST(LaplacianWindowsTest, GetDerivWindows) {
+  constexpr int kMaxWindow = 31;
+  LaplacianWindows<float> windows{kMaxWindow};
+  int size = 3;
+  while (size <= kMaxWindow) {
+    CheckDerivWindow(size, windows);
+    size += 2;
+  }
+}
+
+// Requests smoothing windows of odd sizes in ascending order and checks each one.
+TEST(LaplacianWindowsTest, GetSmoothingWindows) {
+  constexpr int kMaxWindow = 31;
+  LaplacianWindows<float> windows{kMaxWindow};
+  int size = 3;
+  while (size <= kMaxWindow) {
+    CheckSmoothingWindow(size, windows);
+    size += 2;
+  }
+}
+
+// Requests the biggest windows first, so that every smaller window checked afterwards
+// has already been computed by an earlier request.
+TEST(LaplacianWindowsTest, CheckPrecomputed) {
+  constexpr int kMaxWindow = 31;
+  LaplacianWindows<float> windows{kMaxWindow};
+  int size = kMaxWindow;
+  while (size >= 3) {
+    CheckDerivWindow(size, windows);
+    CheckSmoothingWindow(size, windows);
+    size -= 2;
+  }
+}
+
+}  // namespace kernels
+}  // namespace dali
@@ -1,4 +1,4 @@
-# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2020-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -13,6 +13,7 @@
 # limitations under the License.
 
 add_subdirectory(gaussian_blur_gpu)
+add_subdirectory(laplacian_gpu)
 
 # Get all the source files and dump test files
 collect_headers(DALI_INST_HDRS PARENT_SCOPE)
 
@@ -20,6 +20,7 @@
 
 #include "dali/core/static_switch.h"
 #include "dali/kernels/imgproc/convolution/laplacian_cpu.h"
+#include "dali/kernels/imgproc/convolution/laplacian_windows.h"
 #include "dali/kernels/kernel_manager.h"
 #include "dali/operators/image/convolution/laplacian.h"
 #include "dali/pipeline/data/views.h"
@@ -107,8 +108,12 @@ class LaplacianOpCpu : public OpImplBase<CPUBackend> {
   using Kernel = kernels::LaplacianCpu<Out, In, float, axes, has_channels>;
   static constexpr int ndim = Kernel::ndim;
 
-  explicit LaplacianOpCpu(const OpSpec& spec, const DimDesc& dim_desc)
-      : spec_{spec}, args{spec}, dim_desc_{dim_desc} {}
+  /**
+   * @param spec  Pointer to a persistent OpSpec object,
+   *              which is guaranteed to be alive for the entire lifetime of this object
+   */
+  explicit LaplacianOpCpu(const OpSpec* spec, const DimDesc& dim_desc)
+      : spec_{*spec}, args{*spec}, dim_desc_{dim_desc}, lap_windows_{maxWindowSize} {}
 
   bool SetupImpl(std::vector<OutputDesc>& output_desc, const workspace_t<CPUBackend>& ws) override {
     const auto& input = ws.template Input<CPUBackend>(0);
@@ -126,7 +131,9 @@ class LaplacianOpCpu : public OpImplBase<CPUBackend> {
       const auto& window_sizes = args.GetWindowSizes(sample_idx);
       for (int i = 0; i < axes; i++) {
         for (int j = 0; j < axes; j++) {
-          windows_[sample_idx][i][j] = lap_windows_.GetWindow(window_sizes[i][j], i == j);
+          auto window_size = window_sizes[i][j];
+          windows_[sample_idx][i][j] = i == j ? lap_windows_.GetDerivWindow(window_size) :
+                                                lap_windows_.GetSmoothingWindow(window_size);
         }
       }
     }
@@ -182,19 +189,20 @@ class LaplacianOpCpu : public OpImplBase<CPUBackend> {
 
   LaplacianArgs<axes> args;
   DimDesc dim_desc_;
+  kernels::LaplacianWindows<float> lap_windows_;
 
   kernels::KernelManager kmgr_;
   kernels::KernelContext ctx_;
 
-  LaplacianWindows<float> lap_windows_;
   // windows_[i][j] is a window used in convolution along j-th axis in the i-th partial derivative
   std::vector<std::array<std::array<TensorView<StorageCPU, const float, 1>, axes>, axes>> windows_;
 };
 
-
 }  // namespace laplacian
 
-bool Laplacian::SetupImpl(std::vector<OutputDesc>& output_desc, const workspace_t<CPUBackend>& ws) {
+template <>
+bool Laplacian<CPUBackend>::SetupImpl(std::vector<OutputDesc>& output_desc,
+                                      const workspace_t<CPUBackend>& ws) {
   const auto& input = ws.template Input<CPUBackend>(0);
   auto layout = input.GetLayout();
   auto dim_desc = ParseAndValidateDim(input.shape().sample_dim(), layout);
@@ -211,10 +219,10 @@ bool Laplacian::SetupImpl(std::vector<OutputDesc>& output_desc, const workspace_
         BOOL_SWITCH(dim_desc.is_channel_last(), HasChannels, (
           if (dtype == input.type()) {
             using LaplacianSame = laplacian::LaplacianOpCpu<In, In, Axes, HasChannels>;
-            impl_ = std::make_unique<LaplacianSame>(spec_, dim_desc);
+            impl_ = std::make_unique<LaplacianSame>(&spec_, dim_desc);
           } else {
             using LaplacianFloat = laplacian::LaplacianOpCpu<float, In, Axes, HasChannels>;
-            impl_ = std::make_unique<LaplacianFloat>(spec_, dim_desc);
+            impl_ = std::make_unique<LaplacianFloat>(&spec_, dim_desc);
           }
         )); // NOLINT
       ), DALI_FAIL("Axis count out of supported range."));  // NOLINT
@@ -224,10 +232,11 @@ bool Laplacian::SetupImpl(std::vector<OutputDesc>& output_desc, const workspace_
   return impl_->SetupImpl(output_desc, ws);
 }
 
-void Laplacian::RunImpl(workspace_t<CPUBackend>& ws) {
+template <>
+void Laplacian<CPUBackend>::RunImpl(workspace_t<CPUBackend>& ws) {
   impl_->RunImpl(ws);
 }
 
-DALI_REGISTER_OPERATOR(Laplacian, Laplacian, CPU);
+DALI_REGISTER_OPERATOR(Laplacian, Laplacian<CPUBackend>, CPU);
 
 }  // namespace dali