
[Hackathon 5th No.20] Add Exponential and Gamma APIs to Paddle - part #57899

Merged
merged 49 commits on Jan 4, 2024

Changes from 45 commits
49 commits
b91f93c
add exponential
MayYouBeProsperous Sep 30, 2023
32c4130
add gamma distribution
MayYouBeProsperous Oct 7, 2023
e813442
refine docs
MayYouBeProsperous Oct 9, 2023
98f2468
add kl_divergence and test
MayYouBeProsperous Oct 9, 2023
143609e
resolve conflicts
MayYouBeProsperous Oct 9, 2023
f66bd5b
resolve conflicts
MayYouBeProsperous Oct 9, 2023
c4648c1
fix bug
MayYouBeProsperous Oct 9, 2023
319a4fa
Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into…
MayYouBeProsperous Oct 10, 2023
5dfc996
refine test
MayYouBeProsperous Oct 10, 2023
d815a11
Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into…
MayYouBeProsperous Oct 12, 2023
2f4cb60
fix test timeout
MayYouBeProsperous Oct 12, 2023
aaac78d
refine code
MayYouBeProsperous Oct 17, 2023
e5629e9
Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into…
MayYouBeProsperous Nov 5, 2023
9f2504b
add standard_gamma kernel
MayYouBeProsperous Nov 15, 2023
86d9e50
fix comments
MayYouBeProsperous Nov 15, 2023
82f22e1
fix tests
MayYouBeProsperous Nov 15, 2023
c983d80
fix tests
MayYouBeProsperous Nov 15, 2023
6b7762f
Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into…
MayYouBeProsperous Nov 15, 2023
56f8e6f
fix comments
MayYouBeProsperous Nov 15, 2023
7efcb44
fix tests
MayYouBeProsperous Nov 16, 2023
79b1b44
fix gamma grad
MayYouBeProsperous Nov 17, 2023
7d37b9f
fix yaml
MayYouBeProsperous Nov 17, 2023
5f5c6b3
fix bugs
MayYouBeProsperous Nov 17, 2023
9ce0d26
fix tests
MayYouBeProsperous Nov 18, 2023
159db50
fix standard_gamma_grad
MayYouBeProsperous Dec 5, 2023
3ded65e
fix test
MayYouBeProsperous Dec 6, 2023
0cc68a4
fix test
MayYouBeProsperous Dec 6, 2023
26a6398
add cdf & icdf
MayYouBeProsperous Dec 6, 2023
7bb01c1
add cdf & icdf
MayYouBeProsperous Dec 6, 2023
15b0898
refine comments
MayYouBeProsperous Dec 6, 2023
a8c6a5c
fix
MayYouBeProsperous Dec 14, 2023
196067d
Merge branch 'develop' into dis
MayYouBeProsperous Dec 14, 2023
62680b8
fix
MayYouBeProsperous Dec 15, 2023
c42931f
fix head file
MayYouBeProsperous Dec 20, 2023
72266f9
fix
MayYouBeProsperous Dec 20, 2023
27f45ea
fix cuda op
MayYouBeProsperous Dec 20, 2023
c4cf2f9
fix
MayYouBeProsperous Dec 20, 2023
42198d7
fix
MayYouBeProsperous Dec 20, 2023
0f0e613
refine test
MayYouBeProsperous Dec 21, 2023
a5f6e37
fix test
MayYouBeProsperous Dec 21, 2023
4703cb4
refine comments
MayYouBeProsperous Dec 21, 2023
30c3fa2
fix comments
MayYouBeProsperous Dec 22, 2023
df74f48
fix
MayYouBeProsperous Dec 22, 2023
59d4968
Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into…
MayYouBeProsperous Dec 23, 2023
7ac279d
fix
MayYouBeProsperous Dec 25, 2023
6c687c1
fix type check
MayYouBeProsperous Dec 26, 2023
e8c7dae
fix docs
MayYouBeProsperous Dec 27, 2023
a018a66
delete useless comments
MayYouBeProsperous Dec 28, 2023
075ef78
resolve conflict
MayYouBeProsperous Jan 2, 2024
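
For orientation before the file diffs below: the PR adds paddle.distribution.Exponential and paddle.distribution.Gamma, backed by a new standard_gamma op. A minimal usage sketch follows; the constructor parameters (rate for Exponential, concentration and rate for Gamma) and the sample/mean/entropy members follow paddle.distribution conventions and are assumptions, not read from this diff.

import paddle
from paddle.distribution import Exponential, Gamma

# Exponential(rate): density p(x) = rate * exp(-rate * x) for x >= 0.
exp_dist = Exponential(rate=paddle.to_tensor(0.5))
samples = exp_dist.sample([3])  # three independent draws
print(exp_dist.mean)            # 1 / rate = 2.0

# Gamma(concentration, rate): mean = concentration / rate.
gamma_dist = Gamma(concentration=paddle.to_tensor(2.0),
                   rate=paddle.to_tensor(1.0))
print(gamma_dist.sample([3]))
print(gamma_dist.entropy())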
1 change: 1 addition & 0 deletions paddle/phi/api/ext/tensor_compat.h
@@ -144,6 +144,7 @@ using experimental::split;
using experimental::sqrt;
using experimental::square;
using experimental::stack;
using experimental::standard_gamma;
using experimental::strided_slice;
using experimental::subtract;
using experimental::swish;
8 changes: 8 additions & 0 deletions paddle/phi/api/yaml/ops.yaml
@@ -2538,6 +2538,14 @@
func : stack
backward : stack_grad

- op : standard_gamma
Contributor:
Why was the backward op removed again? With this implementation, the input tensor x should have a gradient, shouldn't it?

Contributor Author:
The backward operator is not implemented for now.

args : (Tensor x)
output : Tensor(out)
infer_meta :
func : UnchangedInferMeta
kernel :
func : standard_gamma

- op : stanh
args : (Tensor x, float scale_a=0.67f, float scale_b=1.7159f)
output : Tensor(out)
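To illustrate what the standard_gamma op registered in this hunk computes, a hedged Python sketch; exposing the binding as paddle.standard_gamma is an assumption based on the op registration and the tensor_compat.h change, since the Python layer is not part of this hunk.

import paddle

# Each element of x is a concentration alpha; standard_gamma draws one
# sample from Gamma(alpha, rate=1) per element, preserving shape and dtype.
x = paddle.to_tensor([0.5, 1.0, 4.0])
sample = paddle.standard_gamma(x)
print(sample.shape)  # [3]

# A Gamma(alpha, rate) draw is then just a rescaling, which is why a
# "standard" gamma kernel is sufficient for the Gamma distribution API:
rate = 2.0
gamma_sample = sample / rate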
82 changes: 0 additions & 82 deletions paddle/phi/kernels/cpu/dirichlet_kernel.cc
@@ -13,90 +13,8 @@
// limitations under the License.

#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/cpu/elementwise.h"
#include "paddle/phi/kernels/funcs/elementwise_functor.h"
#include "paddle/phi/kernels/funcs/for_range.h"
#include "paddle/phi/kernels/funcs/reduce_function.h"
#include "paddle/phi/kernels/funcs/reduce_functor.h"
#include "paddle/phi/kernels/impl/dirichlet_kernel_impl.h"

namespace phi {

template <typename T, typename UniformSamplerT, typename NormalSamplerT>
struct GammaCPUFunctor {
GammaCPUFunctor(const T* alpha,
T* gamma,
BaseSampler<T, UniformSamplerT> uniform,
BaseSampler<T, NormalSamplerT> normal)
: alpha_(alpha), gamma_(gamma), uniform_(uniform), normal_(normal) {}

HOST void operator()(int64_t index) {
auto sample = sample_gamma<T, T, UniformSamplerT, NormalSamplerT>(
alpha_[index], uniform_, normal_);
gamma_[index] = std::max(std::numeric_limits<T>::min(), sample);
}

const T* alpha_;
T* gamma_;
BaseSampler<T, UniformSamplerT> uniform_;
BaseSampler<T, NormalSamplerT> normal_;
};

template <typename T>
struct DirichletSampler<CPUContext, T> {
void operator()(const CPUContext& dev_ctx,
const DenseTensor& alpha,
DenseTensor* out) {
auto generator = dev_ctx.GetGenerator()->GetCPUEngine();

auto uniform = [&generator]() -> T {
std::uniform_real_distribution<T> u(0.0, 1.0);
return u(*generator);
};
BaseSampler<T, decltype(uniform)> standard_uniform(uniform);

auto normal = [&generator]() {
std::normal_distribution<T> n(0.0, 1.0);
return n(*generator);
};
BaseSampler<T, decltype(normal)> standard_normal(normal);

// sample from K gamma distributions, where K=alpha.numel()
DenseTensor gamma_samples;
gamma_samples.Resize(alpha.dims());
dev_ctx.template Alloc<T>(&gamma_samples);

GammaCPUFunctor<T, decltype(uniform), decltype(normal)> gamma_functor(
alpha.data<T>(),
gamma_samples.data<T>(),
standard_uniform,
standard_normal);
funcs::ForRange<CPUContext> for_range(dev_ctx, alpha.numel());
for_range(gamma_functor);

// normalize them into a simplex, along the last axis
DenseTensor gamma_sum;
auto new_shape = gamma_samples.dims();
new_shape[new_shape.size() - 1] = 1;
gamma_sum.Resize(new_shape);
dev_ctx.template Alloc<T>(&gamma_sum);

funcs::ReduceKernelImpl<CPUContext, T, T, funcs::SumFunctor>(
dev_ctx,
gamma_samples,
&gamma_sum,
{new_shape.size() - 1},
true,
false);

funcs::ElementwiseCompute<funcs::DivideFunctor<T>, T>(
dev_ctx, gamma_samples, gamma_sum, funcs::DivideFunctor<T>(), out);
}
};

} // namespace phi

PD_REGISTER_KERNEL(
dirichlet, CPU, ALL_LAYOUT, phi::DirichletKernel, float, double) {}
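
The functor removed above clamps each draw away from zero and delegates the actual sampling to sample_gamma from dirichlet_kernel_impl.h, whose body this diff does not show. For reference, a self-contained sketch of the Marsaglia-Tsang rejection scheme that samplers of this shape typically implement; this is an illustration, not the code the PR ships.

#include <cmath>
#include <random>

// One draw from Gamma(alpha, 1), after Marsaglia & Tsang (2000).
template <typename T>
T SampleStandardGamma(T alpha, std::mt19937_64& rng) {
  std::uniform_real_distribution<T> uniform(0.0, 1.0);
  std::normal_distribution<T> normal(0.0, 1.0);

  // For alpha < 1, draw from Gamma(alpha + 1, 1) and scale by U^(1/alpha).
  T boost = T(1);
  if (alpha < T(1)) {
    boost = std::pow(uniform(rng), T(1) / alpha);
    alpha += T(1);
  }

  const T d = alpha - T(1) / T(3);
  const T c = T(1) / std::sqrt(T(9) * d);
  while (true) {
    const T x = normal(rng);
    T v = T(1) + c * x;
    if (v <= T(0)) continue;  // outside the support; reject and redraw
    v = v * v * v;
    const T u = uniform(rng);
    // Cheap squeeze test first, then the exact log-acceptance test.
    if (u < T(1) - T(0.0331) * x * x * x * x) return boost * d * v;
    if (std::log(u) < T(0.5) * x * x + d * (T(1) - v + std::log(v))) {
      return boost * d * v;
    }
  }
}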
20 changes: 20 additions & 0 deletions paddle/phi/kernels/cpu/standard_gamma_kernel.cc
@@ -0,0 +1,20 @@
// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/impl/standard_gamma_kernel_impl.h"

PD_REGISTER_KERNEL(
standard_gamma, CPU, ALL_LAYOUT, phi::StandardGammaKernel, float, double) {}
97 changes: 0 additions & 97 deletions paddle/phi/kernels/gpu/dirichlet_kernel.cu
@@ -1,5 +1,3 @@


// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
@@ -16,102 +14,7 @@

#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/elementwise_divide_kernel.h"
#include "paddle/phi/kernels/funcs/broadcast_function.h"
#include "paddle/phi/kernels/funcs/elementwise_functor.h"
#include "paddle/phi/kernels/funcs/for_range.h"
#include "paddle/phi/kernels/funcs/reduce_function.h"
#include "paddle/phi/kernels/funcs/reduce_functor.h"
#include "paddle/phi/kernels/impl/dirichlet_kernel_impl.h"
#include "paddle/phi/kernels/reduce_sum_kernel.h"

#ifdef PADDLE_WITH_CUDA
#include <curand_kernel.h>
#endif
#ifdef PADDLE_WITH_HIP
#include <hiprand_kernel.h>
#endif

#if defined(PADDLE_WITH_CUDA)
using COMPAT_RANDSTATEPHILOX4_32_10_T = curandStatePhilox4_32_10_t;
#define COMPAT_RAND_INIT curand_init
#define COMPAT_RAND_UNIFORM curand_uniform
#define COMPAT_RAND_NORMAL curand_normal
#elif defined(PADDLE_WITH_HIP)
using COMPAT_RANDSTATEPHILOX4_32_10_T = hiprandStatePhilox4_32_10_t;
#define COMPAT_RAND_INIT hiprand_init
#define COMPAT_RAND_UNIFORM hiprand_uniform
#define COMPAT_RAND_NORMAL hiprand_normal
#endif

namespace phi {
template <typename T>
struct GammaCUDAFunctor {
GammaCUDAFunctor(const T* alpha, T* gamma, uint64_t seed, uint64_t offset)
: alpha_(alpha), gamma_(gamma), seed_(seed), offset_(offset) {}

DEVICE void operator()(int64_t index) {
// curand initialization
COMPAT_RANDSTATEPHILOX4_32_10_T state;
COMPAT_RAND_INIT(
/*seed=*/seed_, /*subsequence=*/index, /*offset=*/offset_, &state);

// sample
auto uniform_lambda = [&state]() { return COMPAT_RAND_UNIFORM(&state); };
BaseSampler<T, decltype(uniform_lambda)> standard_uniform(uniform_lambda);
auto normal_lambda = [&state]() { return COMPAT_RAND_NORMAL(&state); };
BaseSampler<T, decltype(normal_lambda)> standard_normal(normal_lambda);

auto sample =
sample_gamma<T, T, decltype(uniform_lambda), decltype(normal_lambda)>(
alpha_[index], standard_uniform, standard_normal);
gamma_[index] = std::max(std::numeric_limits<T>::min(), sample);
}

const T* alpha_;
T* gamma_;
const uint64_t seed_;
const uint64_t offset_;
};

template <typename T>
struct DirichletSampler<GPUContext, T> {
void operator()(const GPUContext& dev_ctx,
const DenseTensor& alpha,
DenseTensor* out) {
auto p_gen = dev_ctx.GetGenerator();
auto seed_and_offset = p_gen->IncrementOffset(10); // hard-coded offset
auto seed = seed_and_offset.first;
auto offset = seed_and_offset.second;

// sample from K gamma distributions, where K=alpha.numel()
DenseTensor gamma_samples;
gamma_samples.Resize(alpha.dims());
dev_ctx.template Alloc<T>(&gamma_samples);

GammaCUDAFunctor<T> gamma_functor(
alpha.data<T>(), gamma_samples.data<T>(), seed, offset);
funcs::ForRange<GPUContext> for_range(dev_ctx, out->numel());
for_range(gamma_functor);

// normalize them into a simplex, along the last axis
DenseTensor gamma_sum;
auto new_shape = gamma_samples.dims();
new_shape[new_shape.size() - 1] = 1;
gamma_sum.Resize(new_shape);
dev_ctx.template Alloc<T>(&gamma_sum);

phi::SumRawKernel<T, GPUContext>(dev_ctx,
gamma_samples,
{new_shape.size() - 1},
true,
false,
gamma_sum.dtype(),
&gamma_sum);
phi::DivideKernel<T, GPUContext>(dev_ctx, gamma_samples, gamma_sum, out);
}
};
} // namespace phi

PD_REGISTER_KERNEL(dirichlet,
GPU,
27 changes: 27 additions & 0 deletions paddle/phi/kernels/gpu/standard_gamma_kernel.cu
@@ -0,0 +1,27 @@
/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/backends/gpu/gpu_launch_config.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/impl/standard_gamma_kernel_impl.h"

PD_REGISTER_KERNEL(standard_gamma,
GPU,
ALL_LAYOUT,
phi::StandardGammaKernel,
float,
double,
phi::dtype::float16,
phi::dtype::bfloat16) {}
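
Note the dtype coverage: the GPU registration includes float16 and bfloat16, while the CPU registration earlier in this diff covers only float and double. Assuming paddle.standard_gamma dispatches to these kernels, the asymmetry is visible from Python roughly as follows.

import paddle

# Half-precision sampling is registered only for GPU in this diff, so a
# float16 call assumes a CUDA build; on CPU no matching kernel exists.
paddle.device.set_device("gpu")
x = paddle.to_tensor([1.5, 2.5], dtype="float16")
print(paddle.standard_gamma(x))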