[REVIEW] KL Divergence metric implementation #674

Merged · 15 commits · Jun 30, 2019
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -2,6 +2,7 @@

## New Features

- PR #674: KL Divergence metric ml-prim
- PR #652: Adjusted Rand Index metric ml-prim
- PR #679: Class label manipulation ml-prim
- PR #636: Rand Index metric ml-prim
13 changes: 13 additions & 0 deletions cpp/src/metrics/metrics.cu
@@ -19,6 +19,7 @@
#include "metrics.hpp"

#include "metrics/adjustedRandIndex.h"
#include "metrics/klDivergence.h"
#include "metrics/randIndex.h"
#include "score/scores.h"

@@ -49,5 +50,17 @@ double adjustedRandIndex(const cumlHandle &handle, const int *y,
handle.getDeviceAllocator(), handle.getStream());
}

double klDivergence(const cumlHandle &handle, const double *y,
const double *y_hat, int n) {
return MLCommon::Metrics::klDivergence(
y, y_hat, n, handle.getDeviceAllocator(), handle.getStream());
}

float klDivergence(const cumlHandle &handle, const float *y, const float *y_hat,
int n) {
return MLCommon::Metrics::klDivergence(
y, y_hat, n, handle.getDeviceAllocator(), handle.getStream());
}

} // namespace Metrics
} // namespace ML
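
For context, a minimal calling sketch of the new C++ API (editorial illustration, not part of the diff; the include path and the helper name compute_kld are assumptions):

#include "metrics/metrics.hpp"  // assumed include path for the declarations above

// Hypothetical helper: computes KL(P || Q) for two device arrays of n
// probabilities using the double-precision overload added in this PR.
double compute_kld(const ML::cumlHandle &handle, const double *d_p,
                   const double *d_q, int n) {
  return ML::Metrics::klDivergence(handle, d_p, d_q, n);
}
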
136 changes: 84 additions & 52 deletions cpp/src/metrics/metrics.hpp
@@ -23,72 +23,104 @@ namespace ML {
namespace Metrics {

/**
 * Calculates the "Coefficient of Determination" (R-Squared) score
 * normalizing the sum of squared errors by the total sum of squares
 * with single precision.
 *
 * This score indicates the proportionate amount of variation in an
 * expected response variable that is explained by the independent variables
 * in a linear regression model. The larger the R-squared value, the
 * more variability is explained by the linear regression model.
 *
 * @param handle: cumlHandle
 * @param y: Array of ground-truth response variables
 * @param y_hat: Array of predicted response variables
 * @param n: Number of elements in y and y_hat
 * @return: The R-squared value.
 */
float r2_score_py(const cumlHandle &handle, float *y, float *y_hat, int n);

/**
 * Calculates the "Coefficient of Determination" (R-Squared) score
 * normalizing the sum of squared errors by the total sum of squares
 * with double precision.
 *
 * This score indicates the proportionate amount of variation in an
 * expected response variable that is explained by the independent variables
 * in a linear regression model. The larger the R-squared value, the
 * more variability is explained by the linear regression model.
 *
 * @param handle: cumlHandle
 * @param y: Array of ground-truth response variables
 * @param y_hat: Array of predicted response variables
 * @param n: Number of elements in y and y_hat
 * @return: The R-squared value.
 */
double r2_score_py(const cumlHandle &handle, double *y, double *y_hat, int n);
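
For reference, a sketch of the standard definition behind both r2_score_py overloads above, where $\bar{y}$ denotes the mean of the ground-truth values:

$$
R^2 = 1 - \frac{\sum_{i=1}^{n} (y_i - \hat{y}_i)^2}{\sum_{i=1}^{n} (y_i - \bar{y})^2}
$$
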

/**
 * Calculates the "rand index"
 *
 * This metric is a measure of similarity between two data clusterings.
 *
 * @param handle: cumlHandle
 * @param y: Array of response variables of the first clustering classifications
 * @param y_hat: Array of response variables of the second clustering classifications
 * @param n: Number of elements in y and y_hat
 * @return: The rand index value
 */

double randIndex(const cumlHandle &handle, double *y, double *y_hat, int n);
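
As a sketch of the standard definition: with $a$ the number of element pairs placed in the same cluster by both clusterings and $b$ the number of pairs placed in different clusters by both, the Rand index is

$$
RI = \frac{a + b}{\binom{n}{2}}
$$
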

/**
 * Calculates the "adjusted rand index"
 *
 * This metric is the corrected-for-chance version of the rand index.
 *
 * @param handle: cumlHandle
 * @param y: Array of response variables of the first clustering classifications
 * @param y_hat: Array of response variables of the second clustering classifications
 * @param n: Number of elements in y and y_hat
 * @param lower_class_range: the lowest value in the range of classes
 * @param upper_class_range: the highest value in the range of classes
 * @return: The adjusted rand index value
 */
double adjustedRandIndex(const cumlHandle &handle, const int *y,
const int *y_hat, const int n,
const int lower_class_range,
const int upper_class_range);
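
In sketch form, the adjusted Rand index rescales the Rand index by its expected value under random labelings:

$$
ARI = \frac{RI - \mathbb{E}[RI]}{\max(RI) - \mathbb{E}[RI]}
$$
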

/**
 * Calculates the "Kullback-Leibler Divergence"
 *
 * The KL divergence tells us how well the probability distribution Q
 * approximates the probability distribution P.
 * It is also often used as a 'distance metric' between two probability
 * distributions (note that it is not symmetric).
 *
 * @param handle: cumlHandle
 * @param y: Array of probabilities corresponding to distribution P
 * @param y_hat: Array of probabilities corresponding to distribution Q
 * @param n: Number of elements in y and y_hat
 * @return: The KL Divergence value
 */
double klDivergence(const cumlHandle &handle, const double *y,
const double *y_hat, int n);
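
For reference, a sketch of the quantity these overloads compute (discrete KL divergence; the convention $0 \cdot \log 0 := 0$ matches the mapping functor in klDivergence.h):

$$
D_{\mathrm{KL}}(P \,\|\, Q) = \sum_{i=1}^{n} p_i \log\frac{p_i}{q_i},
\qquad \text{with } 0 \cdot \log 0 := 0
$$
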

/**
 * Calculates the "Kullback-Leibler Divergence"
 *
 * The KL divergence tells us how well the probability distribution Q
 * approximates the probability distribution P.
 * It is also often used as a 'distance metric' between two probability
 * distributions (note that it is not symmetric).
 *
 * @param handle: cumlHandle
 * @param y: Array of probabilities corresponding to distribution P
 * @param y_hat: Array of probabilities corresponding to distribution Q
 * @param n: Number of elements in y and y_hat
 * @return: The KL Divergence value
 */
float klDivergence(const cumlHandle &handle, const float *y, const float *y_hat,
int n);

} // namespace Metrics
} // namespace ML
82 changes: 82 additions & 0 deletions cpp/src_prims/metrics/klDivergence.h
@@ -0,0 +1,82 @@
/*
* Copyright (c) 2019, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* @file klDivergence.h
* @brief The KL divergence tells us how well the probability distribution Q AKA candidatePDF
* approximates the probability distribution P AKA modelPDF.
*/

#include <math.h>
#include "common/cuml_allocator.hpp"
#include "common/device_buffer.hpp"
#include "cuda_utils.h"
#include "linalg/map_then_reduce.h"

namespace MLCommon {

/**
 * @brief the KL Divergence mapping function
 *
 * @tparam Type: Data type of the input
 * @param modelPDF: the model probability density function value of type Type
 * @param candidatePDF: the candidate probability density function value of type Type
 */
template <typename Type>
struct KLDOp {
  HDI Type operator()(Type modelPDF, Type candidatePDF) {
    // by convention, terms with zero model probability contribute nothing
    // (0 * log(0) is taken to be 0)
    if (modelPDF == 0.0)
      return 0;
    else
      return modelPDF * (log(modelPDF) - log(candidatePDF));
  }
};

namespace Metrics {

/**
 * @brief Function to calculate KL Divergence
 * <a href="https://en.wikipedia.org/wiki/Kullback%E2%80%93Leibler_divergence">more info on KL Divergence</a>
 *
 * @tparam DataT: Data type of the input array
 * @param modelPDF: the model array of probability density function values of type DataT
 * @param candidatePDF: the candidate array of probability density function values of type DataT
 * @param size: the number of data points
 * @param allocator: object that takes care of temporary device memory allocation of type std::shared_ptr<MLCommon::deviceAllocator>
 * @param stream: the cudaStream object
 * @return: the KL Divergence value
 */
template <typename DataT>
DataT klDivergence(const DataT* modelPDF, const DataT* candidatePDF, int size,
std::shared_ptr<MLCommon::deviceAllocator> allocator,
cudaStream_t stream) {
MLCommon::device_buffer<DataT> d_KLDVal(allocator, stream, 1);
CUDA_CHECK(cudaMemsetAsync(d_KLDVal.data(), 0, sizeof(DataT), stream));

MLCommon::LinAlg::mapThenSumReduce<DataT, KLDOp<DataT>, 256, const DataT*>(
d_KLDVal.data(), (size_t)size, KLDOp<DataT>(), stream, modelPDF,
candidatePDF);

DataT h_KLDVal;

MLCommon::updateHost(&h_KLDVal, d_KLDVal.data(), 1, stream);

CUDA_CHECK(cudaStreamSynchronize(stream));

return h_KLDVal;
}

}; //end namespace Metrics
}; //end namespace MLCommon
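
A minimal host-side sketch of calling the primitive directly (editorial illustration; d_p and d_q are assumed to be device pointers to n probabilities each, and the allocator/stream setup mirrors the test added below):

// Sketch only: direct use of the MLCommon prim with an explicit allocator and stream.
std::shared_ptr<MLCommon::deviceAllocator> allocator(
  new MLCommon::defaultDeviceAllocator());
cudaStream_t stream;
CUDA_CHECK(cudaStreamCreate(&stream));
double kld = MLCommon::Metrics::klDivergence(d_p, d_q, n, allocator, stream);
CUDA_CHECK(cudaStreamDestroy(stream));
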
1 change: 1 addition & 0 deletions cpp/test/CMakeLists.txt
@@ -151,6 +151,7 @@ if(BUILD_PRIMS_TESTS)
prims/grid_sync.cu
prims/hinge.cu
prims/host_buffer.cu
prims/klDivergence.cu
prims/knn.cu
prims/kselection.cu
prims/label.cu
114 changes: 114 additions & 0 deletions cpp/test/prims/klDivergence.cu
@@ -0,0 +1,114 @@
/*
* Copyright (c) 2019, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <gtest/gtest.h>
#include <algorithm>
#include <iostream>
#include <random>
#include "common/cuml_allocator.hpp"
#include "metrics/klDivergence.h"
#include "test_utils.h"

namespace MLCommon {
namespace Metrics {

//parameter structure definition
struct klDivergenceParam {
int nElements;
double tolerance;
};

//test fixture class
template <typename DataT>
class klDivergenceTest : public ::testing::TestWithParam<klDivergenceParam> {
protected:
//test setup (runs before each test)
void SetUp() override {
//getting the parameters
params = ::testing::TestWithParam<klDivergenceParam>::GetParam();

nElements = params.nElements;

//generating random value test input
std::vector<DataT> h_modelPDF(nElements, 0);
std::vector<DataT> h_candidatePDF(nElements, 0);
std::random_device rd;
std::default_random_engine dre(rd());
std::uniform_real_distribution<DataT> realGenerator(0.0, 1.0);

std::generate(h_modelPDF.begin(), h_modelPDF.end(),
[&]() { return realGenerator(dre); });
std::generate(h_candidatePDF.begin(), h_candidatePDF.end(),
[&]() { return realGenerator(dre); });

//allocating and initializing memory to the GPU
CUDA_CHECK(cudaStreamCreate(&stream));
MLCommon::allocate(d_modelPDF, nElements, true);
MLCommon::allocate(d_candidatePDF, nElements, true);

MLCommon::updateDevice(d_modelPDF, &h_modelPDF[0], (int)nElements, stream);
MLCommon::updateDevice(d_candidatePDF, &h_candidatePDF[0], (int)nElements,
stream);
std::shared_ptr<MLCommon::deviceAllocator> allocator(
new defaultDeviceAllocator);

//generating the golden output
for (int i = 0; i < nElements; ++i) {
if (h_modelPDF[i] == 0.0)
truthklDivergence += 0;

else
truthklDivergence +=
h_modelPDF[i] * log(h_modelPDF[i] / h_candidatePDF[i]);
}

//calling the klDivergence CUDA implementation
computedklDivergence = MLCommon::Metrics::klDivergence(
d_modelPDF, d_candidatePDF, nElements, allocator, stream);
}

//test teardown (runs after each test)
void TearDown() override {
CUDA_CHECK(cudaFree(d_modelPDF));
CUDA_CHECK(cudaFree(d_candidatePDF));
CUDA_CHECK(cudaStreamDestroy(stream));
}

//declaring the data values
klDivergenceParam params;
DataT* d_modelPDF = nullptr;
DataT* d_candidatePDF = nullptr;
int nElements = 0;
DataT truthklDivergence = 0;
DataT computedklDivergence = 0;
cudaStream_t stream;
};

//setting test parameter values
const std::vector<klDivergenceParam> inputs = {
  {500, 0.000001}, {200, 0.001}, {5000, 0.000001}, {500000, 0.000001}};

//writing the test suite
typedef klDivergenceTest<double> klDivergenceTestClass;
TEST_P(klDivergenceTestClass, Result) {
ASSERT_NEAR(computedklDivergence, truthklDivergence, params.tolerance);
}
INSTANTIATE_TEST_CASE_P(klDivergence, klDivergenceTestClass,
::testing::ValuesIn(inputs));

} //end namespace Metrics
} //end namespace MLCommon