Yelp · jialeiwang · Aug 1, 2014 · Jun 11, 2014 · Jun 12, 2014 · Jun 12, 2014
diff --git a/moe/optimal_learning/cpp/CMakeLists.txt b/moe/optimal_learning/cpp/CMakeLists.txt
@@ -73,6 +73,7 @@ set(OPTIMAL_LEARNING_CORE_SOURCES
   gpp_math.cpp
   gpp_model_selection.cpp
   gpp_random.cpp
+  gpp_expected_improvement_gpu.cpp
   )
 
 # readonly
@@ -88,6 +89,7 @@ set(OPTIMAL_LEARNING_TEST_SOURCES
   gpp_random_test.cpp
   gpp_test_utils.cpp
   gpp_test_utils_test.cpp
+  gpp_expected_improvement_gpu_test.cpp
   )
 
 # readonly
@@ -228,6 +230,11 @@ function(configure_exec_targets exec_names exec_sources dependencies compile_fla
       COMPILE_DEFINITIONS "${compile_definitions}"
       LINK_FLAGS "${EXTRA_LINK_FLAGS}"
       )
+    # Link the GPU pieces only when MOE_USE_GPU is switched on. The expansion is
+    # quoted so that an unset/empty MOE_USE_GPU evaluates to false instead of
+    # leaving if() with a dangling MATCHES operator (a configure-time error).
+    if ("${MOE_USE_GPU}" MATCHES "1")
+        # GPU_LIB runs the nested gpu/ build; it has to finish before this
+        # target links against gpu/libOL_GPU.so.
+        add_dependencies(${name} GPU_LIB)
+        target_link_libraries(${name} ${CUDA_LIBRARIES}
+            ${CMAKE_BINARY_DIR}/gpu/libOL_GPU.so)
+    endif()
   endforeach()
 endfunction(configure_exec_targets)
 
@@ -258,6 +265,50 @@ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -fopenmp -Wall -Wextra ${EXTR
 # O3 takes longer to compile and the code produced is at best no faster than O2 (gcc, icc).
 string(REGEX REPLACE "O3" "O2" CMAKE_CXX_FLAGS_RELEASE ${CMAKE_CXX_FLAGS_RELEASE})
 
+#### GPU Component
+# readonly
+# Compile definition appended to EXTRA_COMPILE_DEFINITIONS when the GPU build is enabled.
+set(EXTRA_COMPILE_DEFINITIONS_GPU OL_GPU_ENABLED)
+
+# The CUDA C compiler may be different from the c/c++ compiler used for compiling
+# the cpp code, because the currently available CUDA version does not support gcc 4.7+.
+# Default to the main compilers; MOE_GPU_CC / MOE_GPU_CXX take over when they
+# point at existing files.
+set(GPU_CC ${CMAKE_C_COMPILER})
+set(GPU_CXX ${CMAKE_CXX_COMPILER})
+
+# Expansions inside if() are quoted so that an unset variable is tested as an
+# empty string rather than vanishing from the argument list.
+if (EXISTS "${MOE_GPU_CC}")
+    set(GPU_CC ${MOE_GPU_CC})
+endif()
+if (EXISTS "${MOE_GPU_CXX}")
+    set(GPU_CXX ${MOE_GPU_CXX})
+endif()
+
+# If MOE_USE_GPU is turned on via MOE_CMAKE_OPTS, cmake will try to find the CUDA
+# package and call the gpu cmake project to build the CUDA code.
+if ("${MOE_USE_GPU}" MATCHES "1")
+    if (NOT (EXISTS "${MOE_CUDA_SDK_INCLUDE_DIRS}"))
+        message( FATAL_ERROR "MOE_CUDA_SDK_INCLUDE_DIRS not set!" )
+    endif()
+    find_package(CUDA 5.0 REQUIRED)
+    include_directories(${CUDA_INCLUDE_DIRS})
+    set(EXTRA_COMPILE_DEFINITIONS ${EXTRA_COMPILE_DEFINITIONS}
+       ${EXTRA_COMPILE_DEFINITIONS_GPU})
+    # Forwarded to the nested cmake run as a single -D<var>=<value> argument, so
+    # no shell word-splitting (backticks/echo) is needed.
+    set(MOE_GPU_CMAKE_OPTS
+        "-DMOE_CUDA_SDK_INCLUDE_DIRS=${MOE_CUDA_SDK_INCLUDE_DIRS}")
+    # Creates the directory the nested GPU build runs in.
+    add_custom_target(
+        GPU_FOLDER
+        WORKING_DIRECTORY ${CMAKE_BINARY_DIR}
+        COMMAND ${CMAKE_COMMAND} -E make_directory ${CMAKE_BINARY_DIR}/gpu/
+        )
+    # Configures the gpu/ project with the GPU compilers, then builds it.
+    # ${CMAKE_COMMAND} is the cmake executable running this configure step.
+    add_custom_target(
+        GPU_LIB
+        WORKING_DIRECTORY ${CMAKE_BINARY_DIR}/gpu/
+        COMMAND CC=${GPU_CC} CXX=${GPU_CXX} ${CMAKE_COMMAND}
+        ${MOE_GPU_CMAKE_OPTS} ${CMAKE_SOURCE_DIR}/gpu/
+        COMMAND ${CMAKE_COMMAND} --build ${CMAKE_BINARY_DIR}/gpu/
+        )
+    # Target-to-target ordering: the folder must exist before GPU_LIB runs in it.
+    add_dependencies(GPU_LIB GPU_FOLDER)
+endif()
+
+
 #### Object libraries
 # See configure_object_library() function comments for more details.
 # WARNING: You MUST have compatible flags set between OBJECT libraries and targets that depend on them!
@@ -299,9 +350,15 @@ add_library(
   $<TARGET_OBJECTS:OPTIMAL_LEARNING_TEST_BUNDLE>
   ${CMAKE_BINARY_DIR}/__init__.py
   )
+# Run the GPU_LIB target before GPP when GPU support is on. The expansion is
+# quoted so an unset/empty MOE_USE_GPU evaluates to false instead of breaking
+# the if() argument list.
+if ("${MOE_USE_GPU}" MATCHES "1")
+    add_dependencies(GPP GPU_LIB)
+endif()
 
 target_link_libraries(GPP ${PYTHON_LIBRARIES} ${Boost_LIBRARIES})
-
+# Add the CUDA libraries and gpu/libOL_GPU.so to GPP's link line when GPU
+# support is on. The expansion is quoted so an unset/empty MOE_USE_GPU
+# evaluates to false instead of breaking the if() argument list.
+if ("${MOE_USE_GPU}" MATCHES "1")
+    target_link_libraries(GPP ${CUDA_LIBRARIES} ${CMAKE_BINARY_DIR}/gpu/libOL_GPU.so)
+endif()
+
 # Make sure lib ends in ".so"
 set_target_properties(
   GPP PROPERTIES

diff --git a/moe/optimal_learning/cpp/gpp_expected_improvement_gpu.cpp b/moe/optimal_learning/cpp/gpp_expected_improvement_gpu.cpp
@@ -0,0 +1,206 @@
+/*!
+  \file gpp_expected_improvement_gpu.cpp
+  \rst
+  This file contains the implementations of the GPU-related functions. They are C++ wrappers for
+  the CUDA C functions defined in gpu/gpp_cuda_math.cu.
+\endrst*/
+
+#include "gpp_expected_improvement_gpu.hpp"
+
+#include <algorithm>
+#include <vector>
+
+#include "gpp_common.hpp"
+#include "gpp_exception.hpp"
+#include "gpp_logging.hpp"
+#include "gpp_math.hpp"
+#include "gpp_random.hpp"
+
+#ifdef OL_GPU_ENABLED
+
+#include "gpu/gpp_cuda_math.hpp"
+#include "driver_types.h"
+#include "cuda_runtime.h"
+
+#endif
+
+namespace optimal_learning {
+
+#ifdef OL_GPU_ENABLED
+
+/*!\rst
+  Computes an element count from a monte-carlo iteration count and a launch shape.
+
+  One "round" is ``num_threads * num_blocks`` iterations. The iteration count is padded to
+  ``num_mc_itr / round + 1`` rounds (integer division; the extra round is added even when
+  ``num_mc_itr`` divides evenly), and every iteration contributes ``num_points`` elements.
+
+  \param
+    :num_mc_itr: number of monte-carlo iterations
+    :num_threads: thread count, multiplied with ``num_blocks`` to form the round size
+    :num_blocks: block count, multiplied with ``num_threads`` to form the round size
+    :num_points: number of elements per iteration
+  \return
+    ``(num_mc_itr / (num_threads * num_blocks) + 1) * (num_threads * num_blocks) * num_points``
+\endrst*/
+inline int get_vector_size(int num_mc_itr, int num_threads, int num_blocks, int num_points) {
+  const int round_size = num_threads * num_blocks;
+  const int num_rounds = num_mc_itr / round_size + 1;
+  return num_rounds * round_size * num_points;
+}
+
+/*!\rst
+  Requests storage for ``num_doubles_in`` doubles through ``CudaAllocateMemForDoubleVector``.
+  A non-positive count allocates nothing and leaves ``ptr`` null.
+
+  \param
+    :num_doubles_in: number of doubles to allocate
+
+  Throws ``OptimalLearningCudaException`` when the allocation call does not report ``cudaSuccess``.
+\endrst*/
+CudaDevicePointer::CudaDevicePointer(int num_doubles_in) : num_doubles(num_doubles_in) {
+  if (num_doubles_in <= 0) {
+    // nothing to allocate
+    ptr = nullptr;
+    return;
+  }
+
+  CudaError alloc_status = CudaAllocateMemForDoubleVector(num_doubles, &ptr);
+  if (alloc_status.err != cudaSuccess) {
+    // do not keep whatever the failed call left in ptr
+    ptr = nullptr;
+    ThrowException(OptimalLearningCudaException(alloc_status));
+  }
+}
+
+/*!\rst
+  Passes ``ptr`` (possibly null) to ``CudaFreeMem``; any result of that call is ignored.
+\endrst*/
+CudaDevicePointer::~CudaDevicePointer() {
+  CudaFreeMem(ptr);
+}
+
+/*!\rst
+  Builds the exception from a ``CudaError``: the error's file/line info and function info are
+  forwarded to the ``OptimalLearningException`` base, and the message is the
+  ``cudaGetErrorString`` text for the error code.
+
+  \param
+    :_err: error record to translate (``file_and_line_info``, ``func_info`` and ``err`` are read)
+\endrst*/
+OptimalLearningCudaException::OptimalLearningCudaException(const CudaError& _err)
+    : OptimalLearningException(_err.file_and_line_info,
+                               _err.func_info,
+                               cudaGetErrorString(_err.err)) {
+}
+
+/*!\rst
+  Computes the expected improvement for the points held in ``ei_state``.
+
+  The GP mean and variance of the union of points are written into the state's host buffers, the
+  variance buffer is run through ``ComputeCholeskyFactorL``, and both buffers are handed to
+  ``CudaGetEI`` together with the state's GPU buffers and a seed drawn from the state's RNG.
+
+  \param
+    :ei_state[1]: state holding the points, the host/GPU work buffers and the RNG
+  \output
+    :ei_state[1]: host/GPU buffers are overwritten and the RNG is advanced by one draw
+  \return
+    the value ``CudaGetEI`` stores in its output argument
+
+  Throws ``SingularMatrixException`` when ``ComputeCholeskyFactorL`` returns nonzero; failures of
+  ``CudaGetEI`` are raised through ``OL_CUDA_ERROR_THROW``.
+\endrst*/
+double CudaExpectedImprovementEvaluator::ComputeExpectedImprovement(StateType * ei_state) const {
+  const int num_union = ei_state->num_union;
+  auto * const mean = ei_state->to_sample_mean.data();
+  auto * const chol_var = ei_state->cholesky_to_sample_var.data();
+
+  gaussian_process_->ComputeMeanOfPoints(ei_state->points_to_sample_state, mean);
+  gaussian_process_->ComputeVarianceOfPoints(&(ei_state->points_to_sample_state), chol_var);
+
+  // the variance buffer is reused below as the cholesky factor; nonzero flags a failed factorization
+  const int leading_minor_index = ComputeCholeskyFactorL(num_union, chol_var);
+  if (unlikely(leading_minor_index != 0)) {
+    OL_THROW_EXCEPTION(SingularMatrixException, "GP-Variance matrix singular. Check for duplicate points_to_sample/being_sampled or points_to_sample/being_sampled duplicating points_sampled with 0 noise.", chol_var,
+                       num_union, leading_minor_index);
+  }
+
+  // drawn only after the factorization succeeded, so a failure does not consume an RNG value
+  const unsigned int seed = (ei_state->uniform_rng->GetEngine())();
+
+  double ei_value;
+  OL_CUDA_ERROR_THROW(CudaGetEI(mean, chol_var, best_so_far_, num_union,
+                                ei_state->gpu_mu.ptr, ei_state->gpu_chol_var.ptr, ei_state->gpu_ei_storage.ptr,
+                                seed, num_mc_, &ei_value,
+                                ei_state->gpu_random_number_ei.ptr, ei_state->random_number_ei.data(),
+                                ei_state->configure_for_test));
+  return ei_value;
+}
+
+/*!\rst
+  Computes the gradient of the expected improvement for the points held in ``ei_state``.
+
+  The GP mean, gradient of the mean, variance (run through ``ComputeCholeskyFactorL``) and gradient
+  of the cholesky factor are written into the state's host buffers and handed to ``CudaGetGradEI``
+  together with the state's GPU buffers and a seed drawn from the state's RNG.
+
+  \param
+    :ei_state[1]: state holding the points, the host/GPU work buffers and the RNG
+  \output
+    :ei_state[1]: host/GPU buffers are overwritten and the RNG is advanced by one draw
+    :grad_ei: output buffer passed to ``CudaGetGradEI``
+
+  Throws ``OptimalLearningException`` when the state has ``num_derivatives == 0``,
+  ``SingularMatrixException`` when ``ComputeCholeskyFactorL`` returns nonzero; failures of
+  ``CudaGetGradEI`` are raised through ``OL_CUDA_ERROR_THROW``.
+\endrst*/
+void CudaExpectedImprovementEvaluator::ComputeGradExpectedImprovement(StateType * ei_state,
+                                                                      double * restrict grad_ei) const {
+  // a state built without configure_for_gradients has num_derivatives == 0
+  if (ei_state->num_derivatives == 0) {
+    OL_THROW_EXCEPTION(OptimalLearningException, "configure_for_gradients set to false, gradient computation is disabled!");
+  }
+
+  const int num_union = ei_state->num_union;
+  const int num_to_sample = ei_state->num_to_sample;
+  auto * const mean = ei_state->to_sample_mean.data();
+  auto * const grad_mean = ei_state->grad_mu.data();
+  auto * const chol_var = ei_state->cholesky_to_sample_var.data();
+  auto * const grad_chol = ei_state->grad_chol_decomp.data();
+
+  gaussian_process_->ComputeMeanOfPoints(ei_state->points_to_sample_state, mean);
+  gaussian_process_->ComputeGradMeanOfPoints(ei_state->points_to_sample_state, grad_mean);
+  gaussian_process_->ComputeVarianceOfPoints(&(ei_state->points_to_sample_state), chol_var);
+
+  // the variance buffer is reused below as the cholesky factor; nonzero flags a failed factorization
+  const int leading_minor_index = ComputeCholeskyFactorL(num_union, chol_var);
+  if (unlikely(leading_minor_index != 0)) {
+    OL_THROW_EXCEPTION(SingularMatrixException, "GP-Variance matrix singular. Check for duplicate points_to_sample/being_sampled or points_to_sample/being_sampled duplicating points_sampled with 0 noise.", chol_var,
+                       num_union, leading_minor_index);
+  }
+
+  gaussian_process_->ComputeGradCholeskyVarianceOfPoints(&(ei_state->points_to_sample_state), chol_var, grad_chol);
+
+  // drawn only after the factorization succeeded, so a failure does not consume an RNG value
+  const unsigned int seed = (ei_state->uniform_rng->GetEngine())();
+
+  OL_CUDA_ERROR_THROW(CudaGetGradEI(mean, grad_mean, chol_var, grad_chol,
+                                    best_so_far_, num_union, num_to_sample, dim_,
+                                    ei_state->gpu_mu.ptr, ei_state->gpu_grad_mu.ptr, ei_state->gpu_chol_var.ptr,
+                                    ei_state->gpu_grad_chol_var.ptr, ei_state->gpu_grad_ei_storage.ptr,
+                                    seed, num_mc_, grad_ei, ei_state->gpu_random_number_grad_ei.ptr,
+                                    ei_state->random_number_grad_ei.data(), ei_state->configure_for_test));
+}
+
+/*!\rst
+  Selects the device ``devID`` through ``CudaSetDevice``; a failure is raised via ``OL_CUDA_ERROR_THROW``.
+
+  \param
+    :devID: device ID passed to ``CudaSetDevice``
+\endrst*/
+void CudaExpectedImprovementEvaluator::SetupGPU(int devID) {
+  OL_CUDA_ERROR_THROW(CudaSetDevice(devID));
+}
+
+/*!\rst
+  Stores the problem parameters and a pointer to ``gaussian_process_in`` (not a copy), then
+  selects the GPU device.
+
+  \param
+    :gaussian_process_in: GaussianProcess the evaluator keeps a pointer to; its ``dim()`` sets ``dim_``
+    :num_mc_in: number of monte-carlo iterations
+    :best_so_far: stored as ``best_so_far_`` and later passed to the CUDA EI routines
+    :devID_in: device ID forwarded to ``SetupGPU``
+\endrst*/
+CudaExpectedImprovementEvaluator::CudaExpectedImprovementEvaluator(const GaussianProcess& gaussian_process_in,
+                                                                   int num_mc_in, double best_so_far,
+                                                                   int devID_in)
+    : dim_(gaussian_process_in.dim()),
+      num_mc_(num_mc_in),
+      best_so_far_(best_so_far),
+      gaussian_process_(&gaussian_process_in) {
+  SetupGPU(devID_in);
+}
+
+/*!\rst
+  Calls ``cudaDeviceReset()``; its return value is ignored.
+
+  NOTE(review): presumably this resets the whole current device, which would also affect CUDA
+  allocations still owned by other live objects -- confirm the intended lifetime ordering.
+\endrst*/
+CudaExpectedImprovementEvaluator::~CudaExpectedImprovementEvaluator() {
+  cudaDeviceReset();
+}
+
+/*!\rst
+  Constructs a state that is not configured for testing.
+
+  Delegates to the overload taking ``configure_for_test_in`` with that flag set to ``false``, which
+  yields ``configure_for_test == false`` and zero-sized random number buffers (host and GPU).
+  Delegating keeps the two constructors from drifting apart.
+
+  \param
+    :ei_evaluator: evaluator supplying ``dim()`` and the GaussianProcess
+    :points_to_sample[dim][num_to_sample_in]: points copied to the front of ``union_of_points``
+    :points_being_sampled[dim][num_being_sampled_in]: points copied after ``points_to_sample``
+    :num_to_sample_in: number of points in ``points_to_sample``
+    :num_being_sampled_in: number of points in ``points_being_sampled``
+    :configure_for_gradients: when false, ``num_derivatives`` is 0 and the gradient buffers are empty
+    :uniform_rng_in: stored as ``uniform_rng``; used by the evaluator to draw seeds
+\endrst*/
+CudaExpectedImprovementState::CudaExpectedImprovementState(const EvaluatorType& ei_evaluator,
+                                                           double const * restrict points_to_sample,
+                                                           double const * restrict points_being_sampled,
+                                                           int num_to_sample_in, int num_being_sampled_in,
+                                                           bool configure_for_gradients,
+                                                           UniformRandomGenerator * uniform_rng_in)
+    : CudaExpectedImprovementState(ei_evaluator, points_to_sample, points_being_sampled,
+                                   num_to_sample_in, num_being_sampled_in, configure_for_gradients,
+                                   uniform_rng_in, false) {
+}
+
+/*!\rst
+  Constructs a state, optionally configured for testing.
+
+  Sizes the host work buffers and constructs the ``gpu_*`` members with their element counts
+  (presumably these are ``CudaDevicePointer`` objects -- confirm against the header). When
+  ``configure_for_test_in`` is true, the random number buffers (host and GPU) are sized with
+  ``get_vector_size``; otherwise they are constructed with size 0.
+
+  Wherever a constructor parameter carries the needed value, the initializers read the parameter
+  instead of the member of the same meaning, so they do not rely on member declaration order.
+  ``dim``, ``num_union`` and ``num_derivatives`` are still read as members and must be declared
+  before the members whose initializers use them.
+
+  \param
+    :ei_evaluator: evaluator supplying ``dim()``, ``num_mc()`` and the GaussianProcess
+    :points_to_sample[dim][num_to_sample_in]: points copied to the front of ``union_of_points``
+    :points_being_sampled[dim][num_being_sampled_in]: points copied after ``points_to_sample``
+    :num_to_sample_in: number of points in ``points_to_sample``
+    :num_being_sampled_in: number of points in ``points_being_sampled``
+    :configure_for_gradients: when false, ``num_derivatives`` is 0 and the gradient buffers are empty
+    :uniform_rng_in: stored as ``uniform_rng``; used by the evaluator to draw seeds
+    :configure_for_test_in: stored as ``configure_for_test``; enables the random number buffers
+\endrst*/
+CudaExpectedImprovementState::CudaExpectedImprovementState(const EvaluatorType& ei_evaluator,
+                                                           double const * restrict points_to_sample,
+                                                           double const * restrict points_being_sampled,
+                                                           int num_to_sample_in, int num_being_sampled_in,
+                                                           bool configure_for_gradients,
+                                                           UniformRandomGenerator * uniform_rng_in,
+                                                           bool configure_for_test_in)
+    : dim(ei_evaluator.dim()),
+      num_to_sample(num_to_sample_in),
+      num_being_sampled(num_being_sampled_in),
+      num_derivatives(configure_for_gradients ? num_to_sample_in : 0),
+      num_union(num_to_sample_in + num_being_sampled_in),
+      union_of_points(BuildUnionOfPoints(points_to_sample, points_being_sampled, num_to_sample_in, num_being_sampled_in, dim)),
+      points_to_sample_state(*ei_evaluator.gaussian_process(), union_of_points.data(), num_union, num_derivatives),
+      uniform_rng(uniform_rng_in),
+      // host-side work buffers
+      to_sample_mean(num_union),
+      grad_mu(dim*num_derivatives),
+      cholesky_to_sample_var(Square(num_union)),
+      grad_chol_decomp(dim*Square(num_union)*num_derivatives),
+      configure_for_test(configure_for_test_in),
+      // gpu-side buffers, same element counts as their host counterparts
+      gpu_mu(num_union),
+      gpu_chol_var(Square(num_union)),
+      gpu_grad_mu(dim * num_derivatives),
+      gpu_grad_chol_var(dim * Square(num_union) * num_derivatives),
+      gpu_ei_storage(kEINumThreads * kEINumBlocks),
+      gpu_grad_ei_storage(kGradEINumThreads * kGradEINumBlocks * dim * num_derivatives),
+      // random number buffers are only populated in the test configuration
+      gpu_random_number_ei(configure_for_test_in ? get_vector_size(ei_evaluator.num_mc(), kEINumThreads, kEINumBlocks, num_union) : 0),
+      gpu_random_number_grad_ei(configure_for_test_in ? get_vector_size(ei_evaluator.num_mc(), kGradEINumThreads, kGradEINumBlocks, num_union) : 0),
+      random_number_ei(configure_for_test_in ? get_vector_size(ei_evaluator.num_mc(), kEINumThreads, kEINumBlocks, num_union) : 0),
+      random_number_grad_ei(configure_for_test_in ? get_vector_size(ei_evaluator.num_mc(), kGradEINumThreads, kGradEINumBlocks, num_union) : 0) {
+}
+
+/*!\rst
+  Concatenates ``points_to_sample`` and ``points_being_sampled`` into one newly allocated vector,
+  with the ``points_to_sample`` values first.
+
+  NOTE(review): declared ``noexcept`` although constructing the vector allocates; an allocation
+  failure here would therefore terminate the program instead of propagating.
+
+  \param
+    :points_to_sample[dim][num_to_sample]: first group of values to copy
+    :points_being_sampled[dim][num_being_sampled]: second group of values to copy
+    :num_to_sample: number of points in ``points_to_sample``
+    :num_being_sampled: number of points in ``points_being_sampled``
+    :dim: number of values per point
+  \return
+    vector of ``dim*(num_to_sample + num_being_sampled)`` doubles holding both groups back to back
+\endrst*/
+std::vector<double> CudaExpectedImprovementState::BuildUnionOfPoints(double const * restrict points_to_sample, double const * restrict points_being_sampled,
+                                                int num_to_sample, int num_being_sampled, int dim) noexcept {
+  std::vector<double> combined(dim*(num_to_sample + num_being_sampled));
+
+  // std::copy returns the position just past the last value written, i.e. where the second group starts
+  double * const second_group = std::copy(points_to_sample, points_to_sample + dim*num_to_sample, combined.data());
+  std::copy(points_being_sampled, points_being_sampled + dim*num_being_sampled, second_group);
+
+  return combined;
+}
+
+/*!\rst
+  Replaces the ``points_to_sample`` portion (the first ``num_to_sample*dim`` values) of
+  ``union_of_points`` and re-runs ``SetupState`` on ``points_to_sample_state`` for the updated union.
+
+  \param
+    :ei_evaluator: evaluator supplying the GaussianProcess
+    :points_to_sample[dim][num_to_sample]: new values for the points to sample
+\endrst*/
+void CudaExpectedImprovementState::UpdateCurrentPoint(const EvaluatorType& ei_evaluator, double const * restrict points_to_sample) {
+  // overwrite the leading block of union_of_points; the points_being_sampled part is left untouched
+  double const * const source_end = points_to_sample + num_to_sample*dim;
+  std::copy(points_to_sample, source_end, union_of_points.data());
+
+  // refresh the GP quantities derived from the union of points
+  points_to_sample_state.SetupState(*ei_evaluator.gaussian_process(), union_of_points.data(), num_union, num_derivatives);
+}
+
+/*!\rst
+  Points the state at a new set of ``points_to_sample``; forwards to ``UpdateCurrentPoint``.
+
+  \param
+    :ei_evaluator: evaluator supplying the GaussianProcess
+    :points_to_sample[dim][num_to_sample]: new values for the points to sample
+\endrst*/
+void CudaExpectedImprovementState::SetupState(const EvaluatorType& ei_evaluator, double const * restrict points_to_sample) {
+  UpdateCurrentPoint(ei_evaluator, points_to_sample);
+}
+#endif  // OL_GPU_ENABLED
+
+}  // end namespace optimal_learning
+