diff --git a/moe/optimal_learning/cpp/CMakeLists.txt b/moe/optimal_learning/cpp/CMakeLists.txt index 3affba17..d3df48ab 100644 --- a/moe/optimal_learning/cpp/CMakeLists.txt +++ b/moe/optimal_learning/cpp/CMakeLists.txt @@ -73,6 +73,7 @@ set(OPTIMAL_LEARNING_CORE_SOURCES gpp_math.cpp gpp_model_selection.cpp gpp_random.cpp + gpp_expected_improvement_gpu.cpp ) # readonly @@ -88,6 +89,7 @@ set(OPTIMAL_LEARNING_TEST_SOURCES gpp_random_test.cpp gpp_test_utils.cpp gpp_test_utils_test.cpp + gpp_expected_improvement_gpu_test.cpp ) # readonly @@ -228,6 +230,11 @@ function(configure_exec_targets exec_names exec_sources dependencies compile_fla COMPILE_DEFINITIONS "${compile_definitions}" LINK_FLAGS "${EXTRA_LINK_FLAGS}" ) + if (${MOE_USE_GPU} MATCHES "1") + add_dependencies(${name} GPU_LIB) + target_link_libraries(${name} ${CUDA_LIBRARIES} + ${CMAKE_BINARY_DIR}/gpu/libOL_GPU.so) + endif() endforeach() endfunction(configure_exec_targets) @@ -258,6 +265,50 @@ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -fopenmp -Wall -Wextra ${EXTR # O3 takes longer to compile and the code produced is at best no faster than O2 (gcc, icc). string(REGEX REPLACE "O3" "O2" CMAKE_CXX_FLAGS_RELEASE ${CMAKE_CXX_FLAGS_RELEASE}) +#### GPU Component +# readonly +set(EXTRA_COMPILE_DEFINITIONS_GPU OL_GPU_ENABLED) + +# CUDA C compiler may be different from the c/c++ compiler used for compiling cpp +# code, because current available CUDA version does not support gcc 4.7+ +set(GPU_CC ${CMAKE_C_COMPILER}) +set(GPU_CXX ${CMAKE_CXX_COMPILER}) + +if (EXISTS ${MOE_GPU_CC}) + set(GPU_CC ${MOE_GPU_CC}) +endif() +if (EXISTS ${MOE_GPU_CXX}) + set(GPU_CXX ${MOE_GPU_CXX}) +endif() + +# If MOE_USE_GPU is turned on via MOE_CMAKE_OPTS, cmake will try to find CUDA +# package and call gpu cmake to build CUDA code +if (${MOE_USE_GPU} MATCHES "1") + if (NOT (EXISTS ${MOE_CUDA_SDK_INCLUDE_DIRS})) + message( FATAL_ERROR "MOE_CUDA_SDK_INCLUDE_DIRS not set!" ) + endif() + find_package(CUDA 5.0 REQUIRED) + include_directories(${CUDA_INCLUDE_DIRS}) + set(EXTRA_COMPILE_DEFINITIONS ${EXTRA_COMPILE_DEFINITIONS} + ${EXTRA_COMPILE_DEFINITIONS_GPU}) + set(MOE_GPU_CMAKE_OPTS + "-D MOE_CUDA_SDK_INCLUDE_DIRS=${MOE_CUDA_SDK_INCLUDE_DIRS}") + add_custom_target( + GPU_FOLDER + WORKING_DIRECTORY ${CMAKE_BINARY_DIR} + COMMAND mkdir -p ${CMAKE_BINARY_DIR}/gpu/ + ) + add_custom_target( + GPU_LIB + DEPENDS GPU_FOLDER + WORKING_DIRECTORY ${CMAKE_BINARY_DIR}/gpu/ + COMMAND CC=${GPU_CC} CXX=${GPU_CXX} cmake `echo + ${MOE_GPU_CMAKE_OPTS}` ${CMAKE_SOURCE_DIR}/gpu/ + COMMAND make + ) +endif() + + #### Object libraries # See configure_object_library() function comments for more details. # WARNING: You MUST have compatible flags set between OBJECT libraries and targets that depend on them! @@ -299,9 +350,15 @@ add_library( $ ${CMAKE_BINARY_DIR}/__init__.py ) +if (${MOE_USE_GPU} MATCHES "1") + add_dependencies(GPP GPU_LIB) +endif() target_link_libraries(GPP ${PYTHON_LIBRARIES} ${Boost_LIBRARIES}) - +if (${MOE_USE_GPU} MATCHES "1") + target_link_libraries(GPP ${CUDA_LIBRARIES} ${CMAKE_BINARY_DIR}/gpu/libOL_GPU.so) +endif() + # Make sure lib ends in ".so" set_target_properties( GPP PROPERTIES diff --git a/moe/optimal_learning/cpp/gpp_expected_improvement_gpu.cpp b/moe/optimal_learning/cpp/gpp_expected_improvement_gpu.cpp new file mode 100644 index 00000000..0f89c5e9 --- /dev/null +++ b/moe/optimal_learning/cpp/gpp_expected_improvement_gpu.cpp @@ -0,0 +1,211 @@ +/*! + \file gpp_expected_improvement_gpu.cpp + \rst + This file contains implementations of GPU related functions. They are actually C++ wrappers for + CUDA C functions defined in gpu/gpp_cuda_math.cu. +\endrst*/ + +#include "gpp_expected_improvement_gpu.hpp" + +#include + +#include +#include + +#include "gpp_common.hpp" +#include "gpp_exception.hpp" +#include "gpp_logging.hpp" +#include "gpp_math.hpp" +#include "gpp_random.hpp" + +#ifdef OL_GPU_ENABLED + +#include "gpu/gpp_cuda_math.hpp" +#include "driver_types.h" +#include "cuda_runtime.h" + +#endif + +namespace optimal_learning { + +#ifdef OL_GPU_ENABLED + +CudaDevicePointer::CudaDevicePointer(int num_doubles_in) : num_doubles(num_doubles_in) { + if (num_doubles_in > 0) { + CudaError error = CudaAllocateMemForDoubleVector(num_doubles, &ptr); + if (error.err != cudaSuccess) { + ptr = nullptr; + ThrowException(OptimalLearningCudaException(error)); + } + } else { + ptr = nullptr; + } +} + +CudaDevicePointer::~CudaDevicePointer() { + CudaFreeMem(ptr); +} + +OptimalLearningCudaException::OptimalLearningCudaException(const CudaError& error) + : OptimalLearningException(error.file_and_line_info, error.func_info, cudaGetErrorString(error.err)) { +} + +double CudaExpectedImprovementEvaluator::ComputeExpectedImprovement(StateType * ei_state) const { + double EI_val; + int num_union = ei_state->num_union; + gaussian_process_->ComputeMeanOfPoints(ei_state->points_to_sample_state, ei_state->to_sample_mean.data()); + gaussian_process_->ComputeVarianceOfPoints(&(ei_state->points_to_sample_state), ei_state->cholesky_to_sample_var.data()); + int leading_minor_index = ComputeCholeskyFactorL(num_union, ei_state->cholesky_to_sample_var.data()); + if (unlikely(leading_minor_index != 0)) { + OL_THROW_EXCEPTION(SingularMatrixException, + "GP-Variance matrix singular. Check for duplicate points_to_sample/being_sampled or points_to_sample/being_sampled duplicating points_sampled with 0 noise.", + ei_state->cholesky_to_sample_var.data(), num_union, leading_minor_index); + } + uint64_t seed_in = (ei_state->uniform_rng->GetEngine())(); + OL_CUDA_ERROR_THROW(CudaGetEI(ei_state->to_sample_mean.data(), ei_state->cholesky_to_sample_var.data(), + num_union, num_mc_, seed_in, best_so_far_, ei_state->configure_for_test, + ei_state->random_number_ei.data(), &EI_val, ei_state->gpu_mu.ptr, + ei_state->gpu_chol_var.ptr, ei_state->gpu_random_number_ei.ptr, + ei_state->gpu_ei_storage.ptr)); + return EI_val; +} + +void CudaExpectedImprovementEvaluator::ComputeGradExpectedImprovement(StateType * ei_state, + double * restrict grad_ei) const { + if (ei_state->num_derivatives == 0) { + OL_THROW_EXCEPTION(OptimalLearningException, "configure_for_gradients set to false, gradient computation is disabled!"); + } + const int num_union = ei_state->num_union; + const int num_to_sample = ei_state->num_to_sample; + gaussian_process_->ComputeMeanOfPoints(ei_state->points_to_sample_state, ei_state->to_sample_mean.data()); + gaussian_process_->ComputeGradMeanOfPoints(ei_state->points_to_sample_state, ei_state->grad_mu.data()); + gaussian_process_->ComputeVarianceOfPoints(&(ei_state->points_to_sample_state), ei_state->cholesky_to_sample_var.data()); + int leading_minor_index = ComputeCholeskyFactorL(num_union, ei_state->cholesky_to_sample_var.data()); + if (unlikely(leading_minor_index != 0)) { + OL_THROW_EXCEPTION(SingularMatrixException, + "GP-Variance matrix singular. Check for duplicate points_to_sample/being_sampled or points_to_sample/being_sampled duplicating points_sampled with 0 noise.", + ei_state->cholesky_to_sample_var.data(), num_union, leading_minor_index); + } + + gaussian_process_->ComputeGradCholeskyVarianceOfPoints(&(ei_state->points_to_sample_state), + ei_state->cholesky_to_sample_var.data(), + ei_state->grad_chol_decomp.data()); + uint64_t seed_in = (ei_state->uniform_rng->GetEngine())(); + + OL_CUDA_ERROR_THROW(CudaGetGradEI(ei_state->to_sample_mean.data(), ei_state->cholesky_to_sample_var.data(), + ei_state->grad_mu.data(), ei_state->grad_chol_decomp.data(), num_union, + num_to_sample, dim_, num_mc_, seed_in, best_so_far_, ei_state->configure_for_test, + ei_state->random_number_grad_ei.data(), grad_ei, ei_state->gpu_mu.ptr, + ei_state->gpu_chol_var.ptr, ei_state->gpu_grad_mu.ptr, ei_state->gpu_grad_chol_var.ptr, + ei_state->gpu_random_number_grad_ei.ptr, ei_state->gpu_grad_ei_storage.ptr)); +} + +void CudaExpectedImprovementEvaluator::SetupGPU(int devID) { + OL_CUDA_ERROR_THROW(CudaSetDevice(devID)); +} + +CudaExpectedImprovementEvaluator::CudaExpectedImprovementEvaluator(const GaussianProcess& gaussian_process_in, + int num_mc_in, double best_so_far, int devID_in) + : dim_(gaussian_process_in.dim()), + num_mc_(num_mc_in), + best_so_far_(best_so_far), + gaussian_process_(&gaussian_process_in) { + SetupGPU(devID_in); + } + +CudaExpectedImprovementEvaluator::~CudaExpectedImprovementEvaluator() { + cudaDeviceReset(); +} + +CudaExpectedImprovementState::CudaExpectedImprovementState(const EvaluatorType& ei_evaluator, + double const * restrict points_to_sample, + double const * restrict points_being_sampled, + int num_to_sample_in, int num_being_sampled_in, + bool configure_for_gradients, + UniformRandomGenerator * uniform_rng_in) + : dim(ei_evaluator.dim()), + num_to_sample(num_to_sample_in), + num_being_sampled(num_being_sampled_in), + num_derivatives(configure_for_gradients ? num_to_sample : 0), + num_union(num_to_sample + num_being_sampled), + union_of_points(BuildUnionOfPoints(points_to_sample, points_being_sampled, num_to_sample, num_being_sampled, dim)), + points_to_sample_state(*ei_evaluator.gaussian_process(), union_of_points.data(), num_union, num_derivatives), + uniform_rng(uniform_rng_in), + to_sample_mean(num_union), + grad_mu(dim*num_derivatives), + cholesky_to_sample_var(Square(num_union)), + grad_chol_decomp(dim*Square(num_union)*num_derivatives), + configure_for_test(false), + gpu_mu(num_union), + gpu_chol_var(Square(num_union)), + gpu_grad_mu(dim * num_derivatives), + gpu_grad_chol_var(dim * Square(num_union) * num_derivatives), + gpu_ei_storage(kEINumThreads * kEINumBlocks), + gpu_grad_ei_storage(kGradEINumThreads * kGradEINumBlocks * dim * num_derivatives), + gpu_random_number_ei(0), + gpu_random_number_grad_ei(0), + random_number_ei(0), + random_number_grad_ei(0) { +} + +CudaExpectedImprovementState::CudaExpectedImprovementState(const EvaluatorType& ei_evaluator, + double const * restrict points_to_sample, + double const * restrict points_being_sampled, + int num_to_sample_in, int num_being_sampled_in, + bool configure_for_gradients, + UniformRandomGenerator * uniform_rng_in, + bool configure_for_test_in) + : dim(ei_evaluator.dim()), + num_to_sample(num_to_sample_in), + num_being_sampled(num_being_sampled_in), + num_derivatives(configure_for_gradients ? num_to_sample : 0), + num_union(num_to_sample + num_being_sampled), + union_of_points(BuildUnionOfPoints(points_to_sample, points_being_sampled, num_to_sample, num_being_sampled, dim)), + points_to_sample_state(*ei_evaluator.gaussian_process(), union_of_points.data(), num_union, num_derivatives), + uniform_rng(uniform_rng_in), + to_sample_mean(num_union), + grad_mu(dim*num_derivatives), + cholesky_to_sample_var(Square(num_union)), + grad_chol_decomp(dim*Square(num_union)*num_derivatives), + configure_for_test(configure_for_test_in), + gpu_mu(num_union), + gpu_chol_var(Square(num_union)), + gpu_grad_mu(dim * num_derivatives), + gpu_grad_chol_var(dim * Square(num_union) * num_derivatives), + gpu_ei_storage(kEINumThreads * kEINumBlocks), + gpu_grad_ei_storage(kGradEINumThreads * kGradEINumBlocks * dim * num_derivatives), + gpu_random_number_ei(configure_for_test ? GetVectorSize(ei_evaluator.num_mc(), kEINumThreads, kEINumBlocks, num_union) : 0), + gpu_random_number_grad_ei(configure_for_test ? GetVectorSize(ei_evaluator.num_mc(), kGradEINumThreads, kGradEINumBlocks, num_union) : 0), + random_number_ei(configure_for_test ? GetVectorSize(ei_evaluator.num_mc(), kEINumThreads, kEINumBlocks, num_union) : 0), + random_number_grad_ei(configure_for_test ? GetVectorSize(ei_evaluator.num_mc(), kGradEINumThreads, kGradEINumBlocks, num_union) : 0) { +} + +std::vector CudaExpectedImprovementState::BuildUnionOfPoints(double const * restrict points_to_sample, + double const * restrict points_being_sampled, + int num_to_sample, int num_being_sampled, int dim) noexcept { + std::vector union_of_points(dim*(num_to_sample + num_being_sampled)); + std::copy(points_to_sample, points_to_sample + dim*num_to_sample, union_of_points.data()); + std::copy(points_being_sampled, points_being_sampled + dim*num_being_sampled, union_of_points.data() + dim*num_to_sample); + return union_of_points; +} + +int CudaExpectedImprovementState::GetVectorSize(int num_mc_itr, int num_threads, int num_blocks, int num_points) noexcept { + return ((static_cast(num_mc_itr / (num_threads * num_blocks)) + 1) * (num_threads * num_blocks) * num_points); +} + +void CudaExpectedImprovementState::UpdateCurrentPoint(const EvaluatorType& ei_evaluator, double const * restrict points_to_sample) { + // update points_to_sample in union_of_points + std::copy(points_to_sample, points_to_sample + num_to_sample*dim, union_of_points.data()); + + // evaluate derived quantities for the GP + points_to_sample_state.SetupState(*ei_evaluator.gaussian_process(), union_of_points.data(), num_union, num_derivatives); +} + +void CudaExpectedImprovementState::SetupState(const EvaluatorType& ei_evaluator, double const * restrict points_to_sample) { + // update quantities derived from points_to_sample + UpdateCurrentPoint(ei_evaluator, points_to_sample); +} +#endif // OL_GPU_ENABLED + +} // end namespace optimal_learning + diff --git a/moe/optimal_learning/cpp/gpp_expected_improvement_gpu.hpp b/moe/optimal_learning/cpp/gpp_expected_improvement_gpu.hpp new file mode 100644 index 00000000..121fe513 --- /dev/null +++ b/moe/optimal_learning/cpp/gpp_expected_improvement_gpu.hpp @@ -0,0 +1,335 @@ +/*! + \file gpp_expected_improvement_gpu.hpp + \rst + All GPU related functions are declared here, and any other C++ functions who wish to call GPU functions should only call functions here. +\endrst*/ + +#ifndef MOE_OPTIMAL_LEARNING_CPP_GPP_EXPECTED_IMPROVEMENT_GPU_HPP_ +#define MOE_OPTIMAL_LEARNING_CPP_GPP_EXPECTED_IMPROVEMENT_GPU_HPP_ + +#include +#include + +#include "gpp_common.hpp" +#include "gpp_exception.hpp" +#include "gpp_logging.hpp" +#include "gpp_math.hpp" +#include "gpp_random.hpp" + +#ifdef OL_GPU_ENABLED + +#include "gpu/gpp_cuda_math.hpp" +/*!\rst + Macro that checks error message (CudaError object) returned by CUDA functions, and throws + OptimalLearningCudaException if there is error. +\endrst*/ +#define OL_CUDA_ERROR_THROW(X) do {CudaError _ERR = (X); if ((_ERR).err != cudaSuccess) {ThrowException(OptimalLearningCudaException(_ERR));}} while (0) + +#endif + +namespace optimal_learning { +#ifdef OL_GPU_ENABLED + +/*!\rst + This struct does the same job as C++ smart pointer. It contains pointer to memory location on + GPU, its constructor and destructor also take care of memory allocation/deallocation on GPU. +\endrst*/ +struct CudaDevicePointer final { + explicit CudaDevicePointer(int num_doubles_in); + + ~CudaDevicePointer(); + + //! pointer to the memory location on gpu + double* ptr; + //! number of doubles to allocate on gpu, so the memory size is num_doubles * sizeof(double) + int num_doubles; + + OL_DISALLOW_DEFAULT_AND_COPY_AND_ASSIGN(CudaDevicePointer); +}; + +/*!\rst + Exception to handle runtime errors returned by CUDA API functions. This class subclasses + OptimalLearningException in gpp_exception.hpp/cpp, and basiclly has the same functionality + as its superclass, except the constructor is different. +\endrst*/ +class OptimalLearningCudaException : public OptimalLearningException { + public: + //! String name of this exception ofr logging. + constexpr static char const * kName = "OptimalLearningCudaException"; + + /*!\rst + Constructs a OptimalLearningCudaException with struct CudaError + \param + :error: C struct that contains error message returned by CUDA API functions + \endrst*/ + explicit OptimalLearningCudaException(const CudaError& error); + + OL_DISALLOW_DEFAULT_AND_ASSIGN(OptimalLearningCudaException); +}; + +struct CudaExpectedImprovementState; + +/*!\rst + This class has the same functionality as ExpectedImprovementEvaluator (see gpp_math.hpp), + except that computations are performed on GPU. +\endrst*/ +class CudaExpectedImprovementEvaluator final { + public: + using StateType = CudaExpectedImprovementState; + /*!\rst + Constructor that also specify which gpu you want to use (for multi-gpu system) + \endrst*/ + CudaExpectedImprovementEvaluator(const GaussianProcess& gaussian_process_in, + int num_mc_in, double best_so_far, int devID_in); + + ~CudaExpectedImprovementEvaluator(); + + int dim() const noexcept OL_PURE_FUNCTION OL_WARN_UNUSED_RESULT { + return dim_; + } + + int num_mc() const noexcept OL_PURE_FUNCTION OL_WARN_UNUSED_RESULT { + return num_mc_; + } + + const GaussianProcess * gaussian_process() const noexcept OL_PURE_FUNCTION OL_WARN_UNUSED_RESULT { + return gaussian_process_; + } + + /*!\rst + Wrapper for ComputeExpectedImprovement(); see that function for details. + \endrst*/ + double ComputeObjectiveFunction(StateType * ei_state) const OL_NONNULL_POINTERS OL_WARN_UNUSED_RESULT { + return ComputeExpectedImprovement(ei_state); + } + + /*!\rst + Wrapper for ComputeGradExpectedImprovement(); see that function for details. + \endrst*/ + void ComputeGradObjectiveFunction(StateType * ei_state, double * restrict grad_ei) const OL_NONNULL_POINTERS { + ComputeGradExpectedImprovement(ei_state, grad_ei); + } + + /*!\rst + This function has the same functionality as ComputeExpectedImprovement (see gpp_math.hpp) + in class ExpectedImprovementEvaluator. + \param + :ei_state[1]: properly configured state object + \output + :ei_state[1]: state with temporary storage modified; ``uniform_rng`` modified + \return + the expected improvement from sampling ``points_to_sample`` with ``points_being_sampled`` concurrent experiments + \endrst*/ + double ComputeExpectedImprovement(StateType * ei_state) const OL_NONNULL_POINTERS OL_WARN_UNUSED_RESULT; + + /*!\rst + This function has the same functionality as ComputeGradExpectedImprovement (see gpp_math.hpp) + in class ExpectedImprovementEvaluator. + \param + :ei_state[1]: properly configured state object + \output + :ei_state[1]: state with temporary storage modified; ``uniform_rng`` modified + :grad_ei[dim][num_to_sample]: gradient of EI + \endrst*/ + void ComputeGradExpectedImprovement(StateType * ei_state, double * restrict grad_ei) const OL_NONNULL_POINTERS; + + /*!\rst + Call CUDA API function to activate a GPU. + Refer to: http://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__DEVICE.html#group__CUDART__DEVICE_1g418c299b069c4803bfb7cab4943da383 + + \param + :devID: device ID of the GPU need to be activated + \endrst*/ + void SetupGPU(int devID); + + OL_DISALLOW_DEFAULT_AND_COPY_AND_ASSIGN(CudaExpectedImprovementEvaluator); + + private: + //! spatial dimension (e.g., entries per point of points_sampled) + const int dim_; + //! number of mc iterations + int num_mc_; + //! best (minimum) objective function value (in points_sampled_value) + double best_so_far_; + //! pointer to gaussian process used in EI computations + const GaussianProcess * gaussian_process_; +}; + +/*!\rst + This has the same functionality as ExpectedImprovementState (see gpp_math.hpp) except that it is for GPU computing +\endrst*/ +struct CudaExpectedImprovementState final { + using EvaluatorType = CudaExpectedImprovementEvaluator; + + /*!\rst + Constructs an CudaExpectedImprovementState object with a specified source of randomness for + the purpose of computing EI(and its gradient) over the specified set of points to sample. + This establishes properly sized/initialized temporaries for EI computation, including dependent + state from the associated Gaussian Process (which arrives as part of the ei_evaluator). + + .. WARNING:: This object is invalidated if the associated ei_evaluator is mutated. SetupState() + should be called to reset. + + .. WARNING:: + Using this object to compute gradients when ``configure_for_gradients`` := false results in + UNDEFINED BEHAVIOR. + + \param + :ei_evaluator: expected improvement evaluator object that specifies the parameters & GP for EI evaluation + :points_to_sample[dim][num_to_sample]: points at which to evaluate EI and/or its gradient to check their value in future experiments (i.e., test points for GP predictions) + :points_being_sampled[dim][num_being_sampled]: points being sampled in concurrent experiments + :num_to_sample: number of potential future samples; gradients are evaluated wrt these points (i.e., the "q" in q,p-EI) + :num_being_sampled: number of points being sampled in concurrent experiments (i.e., the "p" in q,p-EI) + :configure_for_gradients: true if this object will be used to compute gradients, false otherwise + :uniform_rng[1]: pointer to a properly initialized* UniformRandomGenerator object + + .. NOTE:: + * The UniformRandomGenerator object must already be seeded. If multithreaded computation is used for EI, then every state object + must have a different UniformRandomGenerator (different seeds, not just different objects). + \endrst*/ + CudaExpectedImprovementState(const EvaluatorType& ei_evaluator, double const * restrict points_to_sample, + double const * restrict points_being_sampled, int num_to_sample_in, + int num_being_sampled_in, bool configure_for_gradients, + UniformRandomGenerator* uniform_rng_in); + + // constructor for setting up unit test + CudaExpectedImprovementState(const EvaluatorType& ei_evaluator, double const * restrict points_to_sample, + double const * restrict points_being_sampled, int num_to_sample_in, + int num_being_sampled_in, bool configure_for_gradients, + UniformRandomGenerator * uniform_rng_in, bool configure_for_test); + + CudaExpectedImprovementState(CudaExpectedImprovementState&& OL_UNUSED(other)) = default; + + /*!\rst + Create a vector with the union of points_to_sample and points_being_sampled (the latter is appended to the former). + + Note the l-value return. Assigning the return to a std::vector or passing it as an argument to the ctor + will result in copy-elision or move semantics; no copying/performance loss. + + \param: + :points_to_sample[dim][num_to_sample]: points at which to evaluate EI and/or its gradient to check their value in future experiments (i.e., test points for GP predictions) + :points_being_sampled[dim][num_being_sampled]: points being sampled in concurrent experiments + :num_to_sample: number of potential future samples; gradients are evaluated wrt these points (i.e., the "q" in q,p-EI) + :num_being_sampled: number of points being sampled in concurrent experiments (i.e., the "p" in q,p-EI) + :dim: the number of spatial dimensions of each point array + \return + std::vector with the union of the input arrays: points_being_sampled is *appended* to points_to_sample + \endrst*/ + static std::vector BuildUnionOfPoints(double const * restrict points_to_sample, + double const * restrict points_being_sampled, + int num_to_sample, int num_being_sampled, int dim) + noexcept OL_WARN_UNUSED_RESULT; + + /*!\rst + A simple utility function to calculate how many random numbers will be generated by GPU computation of EI/gradEI given + number of MC simulations. (user set num_mc_itr is not necessarily equal to the actual num_mc_itr used in GPU computation, + because actual num_mc_itr has to be multiple of (num_threads * num_blocks) + + \param: + :num_mc_itr: number of MC simulations + :num_threads: number of threads per block in GPU computation + :num_blocks: number of blocks in GPU computation + :num_points: number of points interested (aka q+p) + \return + int: number of random numbers generated in GPU computation + \endrst*/ + static int GetVectorSize(int num_mc_itr, int num_threads, int num_blocks, int num_points) noexcept OL_WARN_UNUSED_RESULT; + + int GetProblemSize() const noexcept OL_PURE_FUNCTION OL_WARN_UNUSED_RESULT { + return dim*num_to_sample; + } + + /*!\rst + Get the ``points_to_sample``: potential future samples whose EI (and/or gradients) are being evaluated + + \output + :points_to_sample[dim][num_to_sample]: potential future samples whose EI (and/or gradients) are being evaluated + \endrst*/ + void GetCurrentPoint(double * restrict points_to_sample) const noexcept OL_NONNULL_POINTERS { + std::copy(union_of_points.data(), union_of_points.data() + num_to_sample*dim, points_to_sample); + } + + /*!\rst + Change the potential samples whose EI (and/or gradient) are being evaluated. + Update the state's derived quantities to be consistent with the new points. + + \param + :ei_evaluator: expected improvement evaluator object that specifies the parameters & GP for EI evaluation + :points_to_sample[dim][num_to_sample]: potential future samples whose EI (and/or gradients) are being evaluated + \endrst*/ + void UpdateCurrentPoint(const EvaluatorType& ei_evaluator, double const * restrict points_to_sample) OL_NONNULL_POINTERS; + + /*!\rst + Configures this state object with new ``points_to_sample``, the location of the potential samples whose EI is to be evaluated. + Ensures all state variables & temporaries are properly sized. + Properly sets all dependent state variables (e.g., GaussianProcess's state) for EI evaluation. + + .. WARNING:: + This object's state is INVALIDATED if the ``ei_evaluator`` (including the GaussianProcess it depends on) used in + SetupState is mutated! SetupState() should be called again in such a situation. + + \param + :ei_evaluator: expected improvement evaluator object that specifies the parameters & GP for EI evaluation + :points_to_sample[dim][num_to_sample]: potential future samples whose EI (and/or gradients) are being evaluated + \endrst*/ + void SetupState(const EvaluatorType& ei_evaluator, double const * restrict points_to_sample) OL_NONNULL_POINTERS; + + // size information + //! spatial dimension (e.g., entries per point of ``points_sampled``) + const int dim; + //! number of potential future samples; gradients are evaluated wrt these points (i.e., the "q" in q,p-EI) + const int num_to_sample; + //! number of points being sampled concurrently (i.e., the "p" in q,p-EI) + const int num_being_sampled; + //! number of derivative terms desired (usually 0 for no derivatives or num_to_sample) + const int num_derivatives; + //! number of points in union_of_points: num_to_sample + num_being_sampled + const int num_union; + + //! points currently being sampled; this is the union of the points represented by "q" and "p" in q,p-EI + //! ``points_to_sample`` is stored first in memory, immediately followed by ``points_being_sampled`` + std::vector union_of_points; + + //! gaussian process state + GaussianProcess::StateType points_to_sample_state; + + //! random number generator + UniformRandomGenerator* uniform_rng; + + // temporary storage: preallocated space used by CudaExpectedImprovementEvaluator's member functions + //! the mean of the GP evaluated at union_of_points + std::vector to_sample_mean; + //! the gradient of the GP mean evaluated at union_of_points, wrt union_of_points[0:num_to_sample] + std::vector grad_mu; + //! the cholesky (``LL^T``) factorization of the GP variance evaluated at union_of_points + std::vector cholesky_to_sample_var; + //! the gradient of the cholesky (``LL^T``) factorization of the GP variance evaluated at union_of_points wrt union_of_points[0:num_to_sample] + std::vector grad_chol_decomp; + + bool configure_for_test; + //! structs containing pointers to store the memory locations of variables on GPU + //! input data for GPU computations and GPU should not modify them + CudaDevicePointer gpu_mu; + CudaDevicePointer gpu_chol_var; + CudaDevicePointer gpu_grad_mu; + CudaDevicePointer gpu_grad_chol_var; + //! data containing results returned by GPU computations + CudaDevicePointer gpu_ei_storage; + CudaDevicePointer gpu_grad_ei_storage; + //! data containing random numbers used in GPU computations, which are only + //! used for testing + CudaDevicePointer gpu_random_number_ei; + CudaDevicePointer gpu_random_number_grad_ei; + + //! storage for random numbers used in computing EI & grad_ei, this is only used to setup unit test + std::vector random_number_ei; + std::vector random_number_grad_ei; + + OL_DISALLOW_DEFAULT_AND_COPY_AND_ASSIGN(CudaExpectedImprovementState); +}; + +#endif // OL_GPU_ENABLED + +} // end namespace optimal_learning + +#endif // MOE_OPTIMAL_LEARNING_CPP_GPP_EXPECTED_IMPROVEMENT_GPU_HPP_ diff --git a/moe/optimal_learning/cpp/gpp_expected_improvement_gpu_test.cpp b/moe/optimal_learning/cpp/gpp_expected_improvement_gpu_test.cpp new file mode 100644 index 00000000..8deaba72 --- /dev/null +++ b/moe/optimal_learning/cpp/gpp_expected_improvement_gpu_test.cpp @@ -0,0 +1,262 @@ +/*! + \file gpp_expected_improvement_gpu_test.cpp + \rst + Routines to test the functions in gpp_expected_improvement_gpu.cpp. + + The tests verify ExpectedImprovementGPUEvaluator from gpp_expected_improvement_gpu.cpp. + + 1. Monte-Carlo EI vs analytic EI validation: the monte-carlo versions are run to "high" accuracy and checked against + analytic formulae when applicable + + 2. GPU EI vs CPU EI: both use monte-carlo version and run consistency check on various random sample points +\endrst*/ + +#include "gpp_expected_improvement_gpu_test.hpp" + +#include +#include + +#include // NOLINT(build/include_order) + +#include "gpp_common.hpp" +#include "gpp_exception.hpp" +#include "gpp_expected_improvement_gpu.hpp" +#include "gpp_math.hpp" +#include "gpp_random.hpp" +#include "gpp_test_utils.hpp" + +namespace optimal_learning { + +#ifdef OL_GPU_ENABLED + +namespace { +/*!\rst + Test that the EI + grad EI computation (using MC integration) is consistent + with the special analytic case of EI when there is only *ONE* potential point + to sample. + + Generates a set of 40 random test cases for expected improvement with only one potential sample. + The general EI (which uses MC integration) is evaluated to reasonably high accuracy (while not taking too long to run) + and compared against the analytic formula version for consistency. The gradients (spatial) of EI are also checked. + + \return + number of cases where analytic and monte-carlo EI do not match +\endrst*/ +int RunCudaEIConsistencyTests() { + int total_errors = 0; + + int which_gpu = 1; + const int num_mc_iter = 20000000; + const int dim = 3; + const int num_being_sampled = 0; + const int num_to_sample = 1; + const int num_sampled = 7; + + double alpha = 2.80723; + // set best_so_far to be larger than max(points_sampled_value) (but don't make it huge or stability will be suffer) + double best_so_far = 10.0; + bool configure_for_gradients = true; + + UniformRandomGenerator uniform_generator(31278); + boost::uniform_real uniform_double(0.5, 2.5); + + MockExpectedImprovementEnvironment ei_environment; + + std::vector lengths(dim); + std::vector noise_variance(num_sampled, 0.0); + std::vector grad_ei_cuda(dim); + std::vector grad_ei_one_potential_sample(dim); + double ei_cuda; + double ei_one_potential_sample; + + for (int i = 0; i < 40; ++i) { + ei_environment.Initialize(dim, num_to_sample, num_being_sampled, num_sampled, &uniform_generator); + for (int j = 0; j < dim; ++j) { + lengths[j] = uniform_double(uniform_generator.engine); + } + SquareExponential sqexp_covariance(dim, alpha, lengths); + GaussianProcess gaussian_process(sqexp_covariance, ei_environment.points_sampled(), ei_environment.points_sampled_value(), noise_variance.data(), dim, num_sampled); + + OnePotentialSampleExpectedImprovementEvaluator one_potential_sample_ei_evaluator(gaussian_process, best_so_far); + OnePotentialSampleExpectedImprovementEvaluator::StateType one_potential_sample_ei_state(one_potential_sample_ei_evaluator, ei_environment.points_to_sample(), configure_for_gradients); + + CudaExpectedImprovementEvaluator cuda_ei_evaluator(gaussian_process, num_mc_iter, best_so_far, which_gpu); + CudaExpectedImprovementEvaluator::StateType cuda_ei_state(cuda_ei_evaluator, ei_environment.points_to_sample(), ei_environment.points_being_sampled(), num_to_sample, num_being_sampled, configure_for_gradients, &uniform_generator); + + ei_cuda = cuda_ei_evaluator.ComputeObjectiveFunction(&cuda_ei_state); + cuda_ei_evaluator.ComputeGradObjectiveFunction(&cuda_ei_state, grad_ei_cuda.data()); + ei_one_potential_sample = one_potential_sample_ei_evaluator.ComputeObjectiveFunction(&one_potential_sample_ei_state); + one_potential_sample_ei_evaluator.ComputeGradObjectiveFunction(&one_potential_sample_ei_state, grad_ei_one_potential_sample.data()); + + int ei_errors_this_iteration = 0; + if (!CheckDoubleWithinRelative(ei_cuda, ei_one_potential_sample, 5.0e-3)) { + ++ei_errors_this_iteration; + } + if (ei_errors_this_iteration != 0) { + OL_PARTIAL_FAILURE_PRINTF("in EI on iteration %d\n", i); + } + total_errors += ei_errors_this_iteration; + + int grad_ei_errors_this_iteration = 0; + for (int j = 0; j < dim; ++j) { + if (!CheckDoubleWithinRelative(grad_ei_cuda[j], grad_ei_one_potential_sample[j], 4.5e-3)) { + ++grad_ei_errors_this_iteration; + } + } + + if (grad_ei_errors_this_iteration != 0) { + OL_PARTIAL_FAILURE_PRINTF("in EI gradients on iteration %d\n", i); + } + total_errors += grad_ei_errors_this_iteration; + } + + if (total_errors != 0) { + OL_PARTIAL_FAILURE_PRINTF("comparing MC EI to analytic EI failed with %d total_errors\n\n", total_errors); + } else { + OL_PARTIAL_SUCCESS_PRINTF("comparing MC EI to analytic EI passed\n"); + } + return total_errors; +} + +/*!\rst + Tests that the general EI + grad EI computation on CPU (using MC integration) is consistent + with the computation on GPU. We use exactly the same sequences of normal random numbers on + CPU and GPU so that they are supposed to output the same result even if the number of MC + iterations is small. + + Generates a set of 10 random test cases for genral q,p-EI computed on cpu vs gpu. + The computations on cpu and gpu use the same set of normal random numbers for MC + simulation, so that we can make sure the outputs should be consistent, even with + a relative small number of MC iteration. + + \return + number of cases where outputs from cpu and gpu do not match. +\endrst*/ +int RunCudaEIvsCpuEITests() { + int total_errors = 0; + + int which_gpu = 1; + const int num_mc_iter = 40000; + const int dim = 3; + const int num_being_sampled = 4; + const int num_to_sample = 4; + const int num_sampled = 20; + + double alpha = 2.80723; + // set best_so_far to be larger than max(points_sampled_value) (but don't make it huge or stability will be suffer) + double best_so_far = 10.0; + bool configure_for_gradients = true; + bool configure_for_test = true; + + UniformRandomGenerator uniform_generator(31278); + boost::uniform_real uniform_double(0.5, 2.5); + + MockExpectedImprovementEnvironment ei_environment; + + std::vector lengths(dim); + std::vector noise_variance(num_sampled, 0.0); + std::vector grad_ei_cpu(dim*num_to_sample); + std::vector grad_ei_gpu(dim*num_to_sample); + std::vector normal_random_table; + double ei_cpu; + double ei_gpu; + int cpu_num_iter; + + for (int i = 0; i < 10; ++i) { + ei_environment.Initialize(dim, num_to_sample, num_being_sampled, num_sampled, &uniform_generator); + for (int j = 0; j < dim; ++j) { + lengths[j] = uniform_double(uniform_generator.engine); + } + SquareExponential sqexp_covariance(dim, alpha, lengths); + GaussianProcess gaussian_process(sqexp_covariance, ei_environment.points_sampled(), ei_environment.points_sampled_value(), noise_variance.data(), dim, num_sampled); + + CudaExpectedImprovementEvaluator cuda_ei_evaluator(gaussian_process, num_mc_iter, best_so_far, which_gpu); + CudaExpectedImprovementEvaluator::StateType cuda_ei_state(cuda_ei_evaluator, ei_environment.points_to_sample(), ei_environment.points_being_sampled(), num_to_sample, num_being_sampled, configure_for_gradients, &uniform_generator, configure_for_test); + + ei_gpu = cuda_ei_evaluator.ComputeObjectiveFunction(&cuda_ei_state); + + // setup cpu EI computation + normal_random_table = cuda_ei_state.random_number_ei; + cpu_num_iter = normal_random_table.size()/ (num_being_sampled + num_to_sample); + NormalRNGSimulator normal_rng_for_ei(normal_random_table); + ExpectedImprovementEvaluator ei_evaluator_for_ei(gaussian_process, cpu_num_iter, best_so_far); + ExpectedImprovementEvaluator::StateType ei_state_for_ei(ei_evaluator_for_ei, ei_environment.points_to_sample(), ei_environment.points_being_sampled(), num_to_sample, num_being_sampled, false, &normal_rng_for_ei); + ei_cpu = ei_evaluator_for_ei.ComputeObjectiveFunction(&ei_state_for_ei); + + // setup cpu gradEI computation + cuda_ei_evaluator.ComputeGradObjectiveFunction(&cuda_ei_state, grad_ei_gpu.data()); + + normal_random_table = cuda_ei_state.random_number_grad_ei; + cpu_num_iter = normal_random_table.size()/ (num_being_sampled + num_to_sample); + NormalRNGSimulator normal_rng_for_grad_ei(normal_random_table); + ExpectedImprovementEvaluator ei_evaluator_for_grad_ei(gaussian_process, cpu_num_iter, best_so_far); + ExpectedImprovementEvaluator::StateType ei_state_for_grad_ei(ei_evaluator_for_grad_ei, ei_environment.points_to_sample(), ei_environment.points_being_sampled(), num_to_sample, num_being_sampled, true, &normal_rng_for_grad_ei); + ei_evaluator_for_grad_ei.ComputeGradObjectiveFunction(&ei_state_for_grad_ei, grad_ei_cpu.data()); + + int ei_errors_this_iteration = 0; + if (!CheckDoubleWithinRelative(ei_cpu, ei_gpu, 1.0e-12)) { + ++ei_errors_this_iteration; + } + if (ei_errors_this_iteration != 0) { + OL_PARTIAL_FAILURE_PRINTF("in EI on iteration %d\n", i); + } + total_errors += ei_errors_this_iteration; + + int grad_ei_errors_this_iteration = 0; + for (int j = 0; j < dim*num_to_sample; ++j) { + if (!CheckDoubleWithinRelative(grad_ei_cpu[j], grad_ei_gpu[j], 1.0e-12)) { + ++grad_ei_errors_this_iteration; + } + } + + if (grad_ei_errors_this_iteration != 0) { + OL_PARTIAL_FAILURE_PRINTF("in EI gradients on iteration %d\n", i); + } + total_errors += grad_ei_errors_this_iteration; + } + + if (total_errors != 0) { + OL_PARTIAL_FAILURE_PRINTF("comparing cpu EI to gpu EI failed with %d total_errors\n\n", total_errors); + } else { + OL_PARTIAL_SUCCESS_PRINTF("comparing cpu EI to gpu EI passed\n"); + } + return total_errors; +} + +} // end unnamed namespace + +/*!\rst + Invoke all tests for GPU functions. + \return + number of test failures: 0 if all is working well. +\endrst*/ +int RunGPUTests() { + int total_errors = 0; + int error = RunCudaEIConsistencyTests(); + if (error != 0) { + OL_FAILURE_PRINTF("analytic, Cuda EI do not match for 1 potential sample case\n"); + } else { + OL_SUCCESS_PRINTF("analytic, Cuda EI match for 1 potential sample case\n"); + } + total_errors += error; + + error = RunCudaEIvsCpuEITests(); + if (error != 0) { + OL_FAILURE_PRINTF("cudaEI vs cpuEI consistency check failed\n"); + } else { + OL_SUCCESS_PRINTF("cudaEI vs cpuEI consistency check successed\n"); + } + total_errors += error; + return total_errors; +} + +#else // OL_GPU_ENABLED + +int RunGPUTests() { + OL_WARNING_PRINTF("no gpu component is enabled, this test did not run.\n"); + return 0; +} + +#endif // OL_GPU_ENABLED + +} // end namespace optimal_learning diff --git a/moe/optimal_learning/cpp/gpp_expected_improvement_gpu_test.hpp b/moe/optimal_learning/cpp/gpp_expected_improvement_gpu_test.hpp new file mode 100644 index 00000000..92b51bfa --- /dev/null +++ b/moe/optimal_learning/cpp/gpp_expected_improvement_gpu_test.hpp @@ -0,0 +1,29 @@ +/*! + \file gpp_expected_improvement_gpu_test.hpp + \rst + Functions for testing expected improvement functions on GPU. + + Tests are broken into two main groups: + + * consistency test against analytical 1,0-EI result + * compare with CPU(MC) results + .. Note:: These tests do not run if GPU computation (``OL_GPU_ENABLED``) is disabled. + +\endrst*/ + +#ifndef MOE_OPTIMAL_LEARNING_CPP_GPP_EXPECTED_IMPROVEMENT_GPU_TEST_HPP_ +#define MOE_OPTIMAL_LEARNING_CPP_GPP_EXPECTED_IMPROVEMENT_GPU_TEST_HPP_ + +#include "gpp_common.hpp" + +namespace optimal_learning { +/*!\rst + Invoke all tests for GPU functions. + \return + number of test failures: 0 if all is working well. +\endrst*/ +OL_WARN_UNUSED_RESULT int RunGPUTests(); + +} // end namespace optimal_learning +#endif // MOE_OPTIMAL_LEARNING_CPP_GPP_EXPECTED_IMPROVEMENT_GPU_TEST_HPP_ + diff --git a/moe/optimal_learning/cpp/gpp_python_test.cpp b/moe/optimal_learning/cpp/gpp_python_test.cpp index d72c7f37..e9725f30 100644 --- a/moe/optimal_learning/cpp/gpp_python_test.cpp +++ b/moe/optimal_learning/cpp/gpp_python_test.cpp @@ -17,6 +17,7 @@ #include "gpp_covariance_test.hpp" #include "gpp_domain.hpp" #include "gpp_domain_test.hpp" +#include "gpp_expected_improvement_gpu_test.hpp" #include "gpp_geometry_test.hpp" #include "gpp_heuristic_expected_improvement_optimization_test.hpp" #include "gpp_linear_algebra_test.hpp" @@ -75,6 +76,14 @@ int RunCppTestsWrapper() { } total_errors += error; + error = RunGPUTests(); + if (error != 0) { + OL_FAILURE_PRINTF("GPU tests failed\n"); + } else { + OL_SUCCESS_PRINTF("GPU tests passed\n"); + } + total_errors += error; + error = RunLogLikelihoodPingTests(); if (error != 0) { OL_FAILURE_PRINTF("LogLikelihood ping tests failed\n"); diff --git a/moe/optimal_learning/cpp/gpp_random.cpp b/moe/optimal_learning/cpp/gpp_random.cpp index a3d9f827..0af9d294 100644 --- a/moe/optimal_learning/cpp/gpp_random.cpp +++ b/moe/optimal_learning/cpp/gpp_random.cpp @@ -147,7 +147,7 @@ double NormalRNGSimulator::operator()() { int size_of_table = random_number_table_.size(); if (index_ < size_of_table) { ++index_; - return random_number_table_[index_]; + return random_number_table_[index_-1]; } else { OL_THROW_EXCEPTION(InvalidValueException, "All random numbers stored in the RNG have been used up!", index_, size_of_table); } diff --git a/moe/optimal_learning/cpp/gpp_test_utils.cpp b/moe/optimal_learning/cpp/gpp_test_utils.cpp index 35df2219..16dddf44 100644 --- a/moe/optimal_learning/cpp/gpp_test_utils.cpp +++ b/moe/optimal_learning/cpp/gpp_test_utils.cpp @@ -177,9 +177,9 @@ bool CheckDoubleWithin(double value, double truth, double tolerance) noexcept { return passed; } -bool CheckDoubleWithinRelative(double value, double truth, double tolerance) noexcept { +bool CheckDoubleWithinRelativeWithThreshold(double value, double truth, double tolerance, double threshold) noexcept { double denom = std::fabs(truth); - if (denom < std::numeric_limits::min()) { + if (denom < threshold) { denom = 1.0; // don't divide by 0 } double diff = std::fabs((value - truth)/denom); @@ -187,10 +187,13 @@ bool CheckDoubleWithinRelative(double value, double truth, double tolerance) noe if (passed != true) { OL_ERROR_PRINTF("value = %.18E, truth = %.18E, diff = %.18E, tol = %.18E\n", value, truth, diff, tolerance); } - return passed; } +bool CheckDoubleWithinRelative(double value, double truth, double tolerance) noexcept { + return CheckDoubleWithinRelativeWithThreshold(value, truth, tolerance, std::numeric_limits::min()); +} + /*!\rst Uses the Frobenius Norm for convenience; matrix 2-norms are expensive to compute. \endrst*/ diff --git a/moe/optimal_learning/cpp/gpp_test_utils.hpp b/moe/optimal_learning/cpp/gpp_test_utils.hpp index 6edc6524..be0258c7 100644 --- a/moe/optimal_learning/cpp/gpp_test_utils.hpp +++ b/moe/optimal_learning/cpp/gpp_test_utils.hpp @@ -428,6 +428,21 @@ bool CheckDoubleWithin(double value, double truth, double tolerance) noexcept OL \endrst*/ bool CheckDoubleWithinRelative(double value, double truth, double tolerance) noexcept OL_PURE_FUNCTION OL_WARN_UNUSED_RESULT; +/*!\rst + Checks if ``|value - truth| / |truth| <= tolerance`` (relative error) + + If truth < threshold, CheckDoubleWithin() is performed. + + \param + :value: number to be tested + :truth: the exact/desired result + :tolerance: permissible relative difference + :threshold: tolerance = |value - truth| if |truth| < threshold, this is to control unexpected large or undefined relative diff when truth is "too small" (0 for example) + \return + true if value, truth differ relatively by no more than tolerance. +\endrst*/ +bool CheckDoubleWithinRelativeWithThreshold(double value, double truth, double tolerance, double threshold) noexcept OL_PURE_FUNCTION OL_WARN_UNUSED_RESULT; + /*!\rst Checks that ``||A - B||_F <= tolerance`` diff --git a/moe/optimal_learning/cpp/gpu/CMakeLists.txt b/moe/optimal_learning/cpp/gpu/CMakeLists.txt new file mode 100644 index 00000000..d2c9888f --- /dev/null +++ b/moe/optimal_learning/cpp/gpu/CMakeLists.txt @@ -0,0 +1,20 @@ +cmake_minimum_required(VERSION 2.8.9) +find_package(CUDA 5.0 REQUIRED) +CUDA_INCLUDE_DIRECTORIES( + ${CUDA_INCLUDE_DIRS} + ${MOE_CUDA_SDK_INCLUDE_DIRS} + ) + +# Provide a list of actual .cu files +set(CUDA_SRCS + gpp_cuda_math.cu + ) + +# Compiler flags - eg specify different compute capabilites +list(APPEND CUDA_NVCC_FLAGS --gpu-architecture sm_20) + +CUDA_ADD_LIBRARY( + OL_GPU + ${CUDA_SRCS} + SHARED + ) diff --git a/moe/optimal_learning/cpp/gpu/gpp_cuda_math.cu b/moe/optimal_learning/cpp/gpu/gpp_cuda_math.cu new file mode 100644 index 00000000..bffb225b --- /dev/null +++ b/moe/optimal_learning/cpp/gpu/gpp_cuda_math.cu @@ -0,0 +1,394 @@ +/*! + \file gpp_cuda_math.cu + \rst + This file contains implementations of all GPU functions. There are both device code (executed on + GPU device) and host code (executed on CPU), and they are compiled by NVCC, which is a NVIDIA CUDA + compiler. +\endrst*/ + +#include "gpp_cuda_math.hpp" + +#include +#include +#include +#include +#include + +#include + +/*!\rst + Macro to stringify the expansion of a macro. For example, say we are on line 53: + + * ``#__LINE__ --> "__LINE__"`` + * ``OL_CUDA_STRINGIFY_EXPANSION(__LINE__) --> "53"`` + + ``OL_CUDA_STRINGIFY_EXPANSION_INNER`` is not meant to be used directly; + but we need ``#x`` in a macro for this expansion to work. + + This is a standard trick; see bottom of: + http://gcc.gnu.org/onlinedocs/cpp/Stringification.html +\endrst*/ +#define OL_CUDA_STRINGIFY_EXPANSION_INNER(x) #x +#define OL_CUDA_STRINGIFY_EXPANSION(x) OL_CUDA_STRINGIFY_EXPANSION_INNER(x) + +/*!\rst + Macro to stringify and format the current file and line number. For + example, if the macro is invoked from line 893 of file gpp_foo.cpp, + this macro produces the compile-time string-constant: + ``(gpp_foo.cpp: 893)`` +\endrst*/ +#define OL_CUDA_STRINGIFY_FILE_AND_LINE "(" __FILE__ ": " OL_CUDA_STRINGIFY_EXPANSION(__LINE__) ")" + +/*!\rst + Macro that checks error message (with type cudaError_t) returned by CUDA API functions, and if there is error occurred, + the macro produces a C struct containing error message, function name where error occured, file name and line info, and + then terminate the function. +\endrst*/ +#define OL_CUDA_ERROR_RETURN(X) do {cudaError_t _error_code = (X); if (_error_code != cudaSuccess) {CudaError _err = {_error_code, OL_CUDA_STRINGIFY_FILE_AND_LINE, __func__}; return _err;}} while (0) + +namespace optimal_learning { + +namespace { // functions run on gpu device +/*!\rst + Special case of GeneralMatrixVectorMultiply. As long as A has zeros in the strict upper-triangle, + GeneralMatrixVectorMultiply will work too (but take ``>= 2x`` as long). + + Computes results IN-PLACE. + Avoids accessing the strict upper triangle of A. + + Should be equivalent to BLAS call: + ``dtrmv('L', trans, 'N', size_m, A, size_m, x, 1);`` +\endrst*/ +__device__ void CudaTriangularMatrixVectorMultiply(double const * __restrict__ A, int size_m, double * __restrict__ x) { + double temp; + A += size_m * (size_m-1); + for (int j = size_m-1; j >= 0; --j) { // i.e., j >= 0 + temp = x[j]; + for (int i = size_m-1; i >= j+1; --i) { + // handles sub-diagonal contributions from j-th column + x[i] += temp*A[i]; + } + x[j] *= A[j]; // handles j-th on-diagonal component + A -= size_m; + } +} + +/*!\rst + This is reduced version of GeneralMatrixVectorMultiply(...) in gpp_linear_algebra.cpp, and this function computes + y = y - A * x (aka alpha = -1.0, beta = 1.0) +\endrst*/ +__device__ void CudaGeneralMatrixVectorMultiply(double const * __restrict__ A, double const * __restrict__ x, int size_m, int size_n, int lda, double * __restrict__ y) { + double temp; + for (int i = 0; i < size_n; ++i) { + temp = -1.0 * x[i]; + for (int j = 0; j < size_m; ++j) { + y[j] += A[j]*temp; + } + A += lda; + } +} + +/*!\rst + This inline function copies [begin, begin+1, ..., end-1] elements from one array to the other, if bound < end, then end = bound +\endrst*/ +__forceinline__ __device__ void CudaCopyElements(int begin, int end, int bound, double const * __restrict__ origin, double * __restrict__ destination) { + int local_end = end < bound ? end : bound; + for (int idx = begin; idx < local_end; ++idx) { + destination[idx] = origin[idx]; + } +} + +/*!\rst + Device code to compute Expected Improvement by Monte-Carlo on GPU + \param + :mu[num_union]: the mean of the GP evaluated at points interested + :chol_var[num_union][num_union]: cholesky factorization of the GP variance evaluated at points interested + :num_union: number of the points interested + :num_iteration: number of iterations performed on each thread for MC evaluation + :best: best function evaluation obtained so far + :seed: seed for RNG + :ei_storage[num_threads][num_blocks]: array storing values of EI on GPU + :gpu_random_number_ei[num_union][num_iteration][num_threads][num_blocks]: array storing random + numbers used for computing EI, for testing purpose only + :configure_for_test: whether record random_number_ei or not + \output + :ei_storage[num_threads][num_blocks]: each thread write result of computed EI to its corresponding position + :gpu_random_number_ei[num_union][num_iteration][num_threads][num_blocks]: write random numbers + used for computing EI into the array, for testing purpose only +\endrst*/ +__global__ void CudaComputeEIGpu(double const * __restrict__ mu, double const * __restrict__ chol_var, + int num_union, int num_iteration, double best, uint64_t seed, + double * __restrict__ ei_storage, double* __restrict__ gpu_random_number_ei, + bool configure_for_test) { + // copy mu, chol_var to shared memory mu_local & chol_var_local + // For multiple dynamically sized arrays in a single kernel, declare a single extern unsized array, and use + // pointers into it to divide it into multiple arrays + // refer to http://devblogs.nvidia.com/parallelforall/using-shared-memory-cuda-cc/ + extern __shared__ double storage[]; + double * chol_var_local = storage; + double * mu_local = chol_var_local + num_union * num_union; + const int idx = threadIdx.x; + const int IDX = threadIdx.x + blockDim.x * blockIdx.x; + int chunk_size = (num_union * num_union - 1)/ blockDim.x + 1; + CudaCopyElements(chunk_size * idx, chunk_size * (idx + 1), num_union * num_union, chol_var, chol_var_local); + chunk_size = (num_union - 1)/ blockDim.x + 1; + CudaCopyElements(chunk_size * idx, chunk_size * (idx + 1), num_union, mu, mu_local); + __syncthreads(); + + // MC start + // RNG setup + uint64_t local_seed = seed + IDX; + curandState random_state; + // seed a random number generator + curand_init(local_seed, 0, 0, &random_state); + + double *normals = reinterpret_cast(malloc(sizeof(*chol_var_local) * num_union)); + double agg = 0.0; + double improvement_this_step; + double EI; + + for (int mc = 0; mc < num_iteration; ++mc) { + improvement_this_step = 0.0; + for (int i = 0; i < num_union; ++i) { + normals[i] = curand_normal_double(&random_state); + // If configure_for_test is true, random numbers used in MC computations will be saved as output. + // In fact we will let EI compuation on CPU use the same sequence of random numbers saved here, + // so that EI compuation on CPU & GPU can be compared directly for unit test purpose. + if (configure_for_test) { + gpu_random_number_ei[IDX * num_iteration * num_union + mc * num_union + i] = normals[i]; + } + } + CudaTriangularMatrixVectorMultiply(chol_var_local, num_union, normals); + for (int i = 0; i < num_union; ++i) { + EI = best - (mu_local[i] + normals[i]); + improvement_this_step = fmax(EI, improvement_this_step); + } + agg += improvement_this_step; + } + ei_storage[IDX] = agg / static_cast(num_iteration); + free(normals); +} + +/*!\rst + Device code to compute Gradient of Expected Improvement by Monte-Carlo on GPU + \param + :mu[num_union]: the mean of the GP evaluated at points interested + :chol_var[num_union][num_union]: cholesky factorization of the GP variance evaluated at points interested + :grad_mu[dim][num_to_sample]: the gradient of mean of the GP evaluated at points interested + :grad_chol_var[dim][num_union][num_union][num_to_sample]: gradient of cholesky factorization of the GP variance + evaluated at points interested + :num_union: number of the union of points (aka q+p) + :num_to_sample: number of points to sample (aka q) + :dim: dimension of point space + :num_iteration: number of iterations performed on each thread for MC evaluation + :best: best function evaluation obtained so far + :seed: seed for RNG + :grad_ei_storage[dim][num_to_sample][num_threads][num_blocks]: A vector storing result of grad_ei from each thread + :gpu_random_number_grad_ei[num_union][num_itreration][num_threads][num_blocks]: array storing + random numbers used for computing gradEI, for testing purpose only + :configure_for_test: whether record random_number_grad_ei or not + \output + :grad_ei_storage[dim][num_to_sample][num_threads][num_blocks]: each thread write result of grad_ei + to its corresponding positions + :gpu_random_number_grad_ei[num_union][num_iteration][num_threads][num_blocks]: write random numbers + used for computing gradEI to the array, for testing purpose only +\endrst*/ +__global__ void CudaComputeGradEIGpu(double const * __restrict__ mu, double const * __restrict__ chol_var, + double const * __restrict__ grad_mu, double const * __restrict__ grad_chol_var, + int num_union, int num_to_sample, int dim, int num_iteration, double best, + uint64_t seed, double * __restrict__ grad_ei_storage, + double* __restrict__ gpu_random_number_grad_ei, bool configure_for_test) { + // copy mu, chol_var, grad_mu, grad_chol_var to shared memory + extern __shared__ double storage[]; + double * mu_local = storage; + double * chol_var_local = mu_local + num_union; + double * grad_mu_local = chol_var_local + num_union * num_union; + double * grad_chol_var_local = grad_mu_local + num_to_sample * dim; + const int idx = threadIdx.x; + const int IDX = threadIdx.x + blockDim.x * blockIdx.x; + int chunk_size = (num_to_sample * num_union * num_union * dim - 1)/ blockDim.x + 1; + CudaCopyElements(chunk_size * idx, chunk_size * (idx + 1), num_to_sample * num_union * num_union * dim, + grad_chol_var, grad_chol_var_local); + chunk_size = (num_union * num_union - 1)/ blockDim.x + 1; + CudaCopyElements(chunk_size * idx, chunk_size * (idx + 1), num_union * num_union, chol_var, chol_var_local); + chunk_size = (num_to_sample * dim - 1)/ blockDim.x + 1; + CudaCopyElements(chunk_size * idx, chunk_size * (idx + 1), num_to_sample * dim, grad_mu, grad_mu_local); + chunk_size = (num_union - 1)/ blockDim.x + 1; + CudaCopyElements(chunk_size * idx, chunk_size * (idx + 1), num_union, mu, mu_local); + __syncthreads(); + + int i, k, mc, winner; + double EI, improvement_this_step; + // RNG setup + uint64_t local_seed = seed + IDX; + curandState random_state; + curand_init(local_seed, 0, 0, &random_state); + double* normals = reinterpret_cast(malloc(sizeof(*mu_local) * num_union)); + double* normals_copy = reinterpret_cast(malloc(sizeof(*mu_local) * num_union)); + // initialize grad_ei_storage + for (int i = 0; i < (num_to_sample * dim); ++i) { + grad_ei_storage[IDX*num_to_sample*dim + i] = 0.0; + } + // MC step start + for (mc = 0; mc < num_iteration; ++mc) { + improvement_this_step = 0.0; + winner = -1; + for (i = 0; i < num_union; ++i) { + normals[i] = curand_normal_double(&random_state); + normals_copy[i] = normals[i]; + // If configure_for_test is true, random numbers used in MC computations will be saved as output. + // In fact we will let grad_ei compuation on CPU use the same sequence of random numbers saved here, + // so that grad_ei compuation on CPU & GPU can be compared directly for unit test purpose. + if (configure_for_test) { + gpu_random_number_grad_ei[IDX * num_iteration * num_union + mc * num_union + i] = normals[i]; + } + } + CudaTriangularMatrixVectorMultiply(chol_var_local, num_union, normals); + for (i = 0; i < num_union; ++i) { + EI = best - (mu_local[i] + normals[i]); + if (EI > improvement_this_step) { + improvement_this_step = EI; + winner = i; + } + } + if (improvement_this_step > 0.0) { + if (winner < num_to_sample) { + for (k = 0; k < dim; ++k) { + grad_ei_storage[IDX*num_to_sample*dim + winner * dim + k] -= grad_mu_local[winner * dim + k]; + } + } + for (i = 0; i < num_to_sample; ++i) { // derivative w.r.t ith point + CudaGeneralMatrixVectorMultiply(grad_chol_var_local + i * num_union * num_union * dim + + winner * num_union * dim, normals_copy, dim, num_union, + dim, grad_ei_storage + IDX * num_to_sample * dim + i * dim); + } + } + } + + for (int i = 0; i < num_to_sample*dim; ++i) { + grad_ei_storage[IDX*num_to_sample*dim + i] /= static_cast(num_iteration); + } + free(normals); + free(normals_copy); +} + +} // end unnamed namespace + +CudaError CudaAllocateMemForDoubleVector(int num_doubles, double** __restrict__ address_of_ptr_to_gpu_memory) { + CudaError _success = {cudaSuccess, OL_CUDA_STRINGIFY_FILE_AND_LINE, __func__}; + int mem_size = num_doubles * sizeof(**address_of_ptr_to_gpu_memory); + OL_CUDA_ERROR_RETURN(cudaMalloc(reinterpret_cast(address_of_ptr_to_gpu_memory), mem_size)); + return _success; +} + +void CudaFreeMem(double* __restrict__ ptr_to_gpu_memory) { + cudaFree(ptr_to_gpu_memory); +} + +CudaError CudaGetEI(double * __restrict__ mu, double * __restrict__ chol_var, int num_union, int num_mc, + uint64_t seed, double best, bool configure_for_test, double * __restrict__ random_number_ei, + double * __restrict__ ei_val, double * __restrict__ gpu_mu, double * __restrict__ gpu_chol_var, + double* __restrict__ gpu_random_number_ei, double * __restrict__ gpu_ei_storage) { + *ei_val = 0.0; + CudaError _success = {cudaSuccess, OL_CUDA_STRINGIFY_FILE_AND_LINE, __func__}; + + // We assign kEINumBlocks blocks and kEINumThreads threads/block for EI computation, so there are + // (kEINumBlocks * kEINumThreads) threads in total to execute kernel function in parallel + dim3 threads(kEINumThreads); + dim3 grid(kEINumBlocks); + double ei_storage[kEINumThreads * kEINumBlocks]; + int num_iteration = num_mc / (kEINumThreads * kEINumBlocks) + 1; // make sure num_iteration is always >= 1 + + int mem_size_mu = num_union * sizeof(*mu); + int mem_size_chol_var = num_union * num_union * sizeof(*mu); + int mem_size_ei_storage = kEINumThreads * kEINumBlocks * sizeof(*mu); + // copy mu, chol_var to GPU + OL_CUDA_ERROR_RETURN(cudaMemcpy(gpu_mu, mu, mem_size_mu, cudaMemcpyHostToDevice)); + OL_CUDA_ERROR_RETURN(cudaMemcpy(gpu_chol_var, chol_var, mem_size_chol_var, cudaMemcpyHostToDevice)); + // execute kernel + CudaComputeEIGpu <<< grid, threads, num_union*sizeof(*mu)+num_union*num_union*sizeof(*mu) >>> + (gpu_mu, gpu_chol_var, num_union, num_iteration, best, seed, gpu_ei_storage, + gpu_random_number_ei, configure_for_test); + OL_CUDA_ERROR_RETURN(cudaPeekAtLastError()); + // copy gpu_ei_storage back to CPU + OL_CUDA_ERROR_RETURN(cudaMemcpy(ei_storage, gpu_ei_storage, mem_size_ei_storage, cudaMemcpyDeviceToHost)); + // copy gpu_random_number_ei back to CPU if configure_for_test is on + if (configure_for_test) { + int mem_size_random_number_ei = num_iteration * kEINumThreads * kEINumBlocks * num_union * sizeof(*mu); + OL_CUDA_ERROR_RETURN(cudaMemcpy(random_number_ei, gpu_random_number_ei, mem_size_random_number_ei, cudaMemcpyDeviceToHost)); + } + // average ei_storage + double ave = 0.0; + for (int i = 0; i < (kEINumThreads*kEINumBlocks); ++i) { + ave += ei_storage[i]; + } + *ei_val = ave / static_cast(kEINumThreads*kEINumBlocks); + return _success; +} + +CudaError CudaGetGradEI(double * __restrict__ mu, double * __restrict__ chol_var, double * __restrict__ grad_mu, + double * __restrict__ grad_chol_var, int num_union, int num_to_sample, int dim, int num_mc, + uint64_t seed, double best, bool configure_for_test, double* __restrict__ random_number_grad_ei, + double * __restrict__ grad_ei, double * __restrict__ gpu_mu, double * __restrict__ gpu_chol_var, + double * __restrict__ gpu_grad_mu, double * __restrict__ gpu_grad_chol_var, + double* __restrict__ gpu_random_number_grad_ei, double * __restrict__ gpu_grad_ei_storage) { + CudaError _success = {cudaSuccess, OL_CUDA_STRINGIFY_FILE_AND_LINE, __func__}; + + double grad_ei_storage[num_to_sample * dim * kGradEINumThreads * kGradEINumBlocks]; + std::fill(grad_ei, grad_ei + num_to_sample * dim, 0.0); + + // We assign kGradEINumBlocks blocks and kGradEINumThreads threads/block for grad_ei computation, + // so there are (kGradEINumBlocks * kGradEINumThreads) threads in total to execute kernel function + // in parallel + dim3 threads(kGradEINumThreads); + dim3 grid(kGradEINumBlocks); + int num_iteration = num_mc / (kGradEINumThreads * kGradEINumBlocks) + 1; // make sure num_iteration is always >= 1 + + int mem_size_mu = num_union * sizeof(*mu); + int mem_size_grad_mu = num_to_sample * dim * sizeof(*mu); + int mem_size_chol_var = num_union * num_union *sizeof(*mu); + int mem_size_grad_chol_var = num_to_sample * num_union * num_union * dim * sizeof(*mu); + int mem_size_grad_ei_storage= kGradEINumThreads * kGradEINumBlocks * num_to_sample * dim * sizeof(*mu); + + OL_CUDA_ERROR_RETURN(cudaMemcpy(gpu_mu, mu, mem_size_mu, cudaMemcpyHostToDevice)); + OL_CUDA_ERROR_RETURN(cudaMemcpy(gpu_grad_mu, grad_mu, mem_size_grad_mu, cudaMemcpyHostToDevice)); + OL_CUDA_ERROR_RETURN(cudaMemcpy(gpu_chol_var, chol_var, mem_size_chol_var, cudaMemcpyHostToDevice)); + OL_CUDA_ERROR_RETURN(cudaMemcpy(gpu_grad_chol_var, grad_chol_var, mem_size_grad_chol_var, cudaMemcpyHostToDevice)); + + // execute kernel + // inputs: gpu_mu, gpu_chol_var, gpu_grad_mu, gpu_grad_chol_var, best, num_union, num_to_sample, dim, num_iteration, seed + // output: gpu_grad_ei_storage + CudaComputeGradEIGpu <<< grid, threads, mem_size_mu+mem_size_chol_var+mem_size_grad_mu+mem_size_grad_chol_var >>> + (gpu_mu, gpu_chol_var, gpu_grad_mu, gpu_grad_chol_var, num_union, num_to_sample, dim, + num_iteration, best, seed, gpu_grad_ei_storage, gpu_random_number_grad_ei, configure_for_test); + OL_CUDA_ERROR_RETURN(cudaPeekAtLastError()); + + OL_CUDA_ERROR_RETURN(cudaMemcpy(grad_ei_storage, gpu_grad_ei_storage, mem_size_grad_ei_storage, cudaMemcpyDeviceToHost)); + // copy gpu_random_number_grad_ei back to CPU if configure_for_test is on + if (configure_for_test) { + int mem_size_random_number_grad_ei = num_iteration * kGradEINumThreads * kGradEINumBlocks * num_union * sizeof(*mu); + OL_CUDA_ERROR_RETURN(cudaMemcpy(random_number_grad_ei, gpu_random_number_grad_ei, mem_size_random_number_grad_ei, cudaMemcpyDeviceToHost)); + } + + // The code block below extracts grad_ei from grad_ei_storage, which is output from the function + // "CudaGetGradEI" run on gpu. The way to do that is for each component of grad_ei, we find all + // the threads calculating the corresponding component and average over the threads. + for (int n = 0; n < (kGradEINumThreads*kGradEINumBlocks); ++n) { + for (int i = 0; i < num_to_sample*dim; ++i) { + grad_ei[i] += grad_ei_storage[n*num_to_sample*dim + i]; + } + } + for (int i = 0; i < num_to_sample*dim; ++i) { + grad_ei[i] /= static_cast(kGradEINumThreads*kGradEINumBlocks); + } + return _success; +} + +CudaError CudaSetDevice(int devID) { + CudaError _success = {cudaSuccess, OL_CUDA_STRINGIFY_FILE_AND_LINE, __func__}; + OL_CUDA_ERROR_RETURN(cudaSetDevice(devID)); + return _success; +} + +} // end namespace optimal_learning + diff --git a/moe/optimal_learning/cpp/gpu/gpp_cuda_math.hpp b/moe/optimal_learning/cpp/gpu/gpp_cuda_math.hpp new file mode 100644 index 00000000..8561e584 --- /dev/null +++ b/moe/optimal_learning/cpp/gpu/gpp_cuda_math.hpp @@ -0,0 +1,139 @@ +/*!\rst + \file gpp_cuda_math.hpp + \rst + This file contains declaration of gpu functions (host code) that are called by C++ code. The functions include calculating ExpectedImprovement, gradient of ExpectedImprovement, and gpu utility functions (memory allocation, setup gpu device, etc) +\endrst*/ + +#ifndef MOE_OPTIMAL_LEARNING_CPP_GPU_GPP_CUDA_MATH_HPP_ +#define MOE_OPTIMAL_LEARNING_CPP_GPU_GPP_CUDA_MATH_HPP_ + +#include + +#include "driver_types.h" + +namespace optimal_learning { + +//! Number of blocks assigned for computing Expected Improvement on GPU +static unsigned int kEINumBlocks = 32; +//! Number of threads per block assigned for computing Expected Improvement on GPU +static unsigned int kEINumThreads = 256; +//! Number of blocks assigned for computing Gradient of Expected Improvement on GPU +static unsigned int kGradEINumBlocks = 32; +//! Number of threads per block assigned for computing Gradient of Expected Improvement on GPU +static unsigned int kGradEINumThreads = 256; + +/*!\rst + This C struct contains error information that are used by exception handling in gpp_expected_improvement_gpu.hpp/cpp +\endrst*/ +struct CudaError { + //! error returned by CUDA API functions(basiclly enum type) + cudaError_t err; + //! file and line info of the function which returned error + char const * file_and_line_info; + //! name of the function that returned error + char const * func_info; +}; + +/*!\rst + Compute Expected Improvement by Monte-Carlo using GPU, and this function is only meant to be used by + CudaExpectedImprovementEvaluator::ComputeExpectedImprovement(...) in gpp_expected_improvement_gpu.hpp/cpp + \param + :mu[num_union]: the mean of the GP evaluated at points interested + :chol_var[num_union][num_union]: cholesky factorization of the GP variance evaluated at points interested + :num_union: number of the points interested + :num_mc: number of iterations for Monte-Carlo simulation + :seed: seed for RNG + :best: best function evaluation obtained so far + :configure_for_test: whether record random_number_ei or not + :random_number_ei[num_union][num_iteration][num_threads][num_blocks]: random numbers used for + computing EI, for testing purpose only + :ei_val[1]: pointer to value of Expected Improvement + :gpu_mu[num_union]: pointer to memory storing mu on GPU + :gpu_chol_var[num_union][num_union]: pointer to memory storing chol_var on GPU + :gpu_random_number_ei[num_union][num_iteration][num_threads][num_blocks]: pointer to memory storing + random numbers used for computing EI, for testing purpose only + :gpu_ei_storage[num_threads][num_blocks]: pointer to memory storing values of EI on GPU + \output + :ei_val[1]: value of Expected Improvement modified, and equals to computed value of EI + :gpu_random_number_ei[num_union][num_iteration][num_threads][num_blocks]: pointer to memory storing + random numbers used for computing EI, for testing purpose only + :random_number_ei[num_union][num_iteration][num_threads][num_blocks]: random numbers used for + computing EI, for testing purpose only + \return + CudaError state, which contains error information, file name, line and function name of the function that occurs error +\endrst*/ +extern "C" CudaError CudaGetEI(double * __restrict__ mu, double * __restrict__ chol_var, int num_union, + int num_mc, uint64_t seed, double best, bool configure_for_test, + double* __restrict__ random_number_ei, double* __restrict__ ei_val, + double * __restrict__ gpu_mu, double * __restrict__ gpu_chol_var, + double* __restrict__ gpu_random_number_ei, double * __restrict__ gpu_ei_storage); + +/*!\rst + Compute Gradient of Expected Improvement by Monte-Carlo using GPU, and this function is only meant to be used by + CudaExpectedImprovementEvaluator::ComputeGradExpectedImprovement(...) in gpp_expected_improvement_gpu.hpp/cpp + \param + :mu[num_union]: the mean of the GP evaluated at points interested + :chol_var[num_union][num_union]: cholesky factorization of the GP variance evaluated at points interested + :grad_mu[dim][num_to_sample]: the gradient of mean of the GP evaluated at points interested + :grad_chol_var[dim][num_union][num_union][num_to_sample]: gradient of cholesky factorization of + the GP variance evaluated at points interested + :num_union: number of the union of points (aka q+p) + :num_to_sample: number of points to sample (aka q) + :dim: dimension of point space + :num_mc: number of iterations for Monte-Carlo simulation + :seed: seed for RNG + :best: best function evaluation obtained so far + :configure_for_test: whether record random_number_grad_ei or not + :random_number_grad_ei[num_union][num_threads][num_blocks]: random numbers used for computing gradEI, + for testing purpose only + :grad_ei[dim][num_to_sample]: pointer to value of gradient of Expected Improvement + :gpu_mu[num_union]: pointer to memory storing mu on GPU + :gpu_chol_var[num_union][num_union]: pointer to memory storing chol_var on GPU + :gpu_grad_mu[dim][num_to_sample]: pointer to memory storing grad_mu on GPU + :gpu_grad_chol_var[dim][num_union][num_union][num_to_sample]: pointer to memory storing grad_chol_var on GPU + :gpu_random_number_grad_ei[num_union][num_threads][num_blocks]: pointer to memory storing random + numbers used for computing gradEI, for testing purpose only + :gpu_grad_ei_storage[dim][num_to_sample][num_threads][num_blocks]: pointer to memory storing values of gradient EI on GPU + \output + :random_number_grad_ei[num_union][num_threads][num_blocks]: random numbers used for computing gradEI, for testing purpose only + :grad_ei[dim][num_to_sample]: pointer to value of gradient of Expected Improvement + :gpu_random_number_grad_ei[num_union][num_threads][num_blocks]: pointer to memory storing random + numbers used for computing gradEI, for testing purpose only + \return + CudaError state, which contains error information, file name, line and function name of the function that occurs error +\endrst*/ +extern "C" CudaError CudaGetGradEI(double * __restrict__ mu, double * __restrict__ chol_var, double * __restrict__ grad_mu, + double * __restrict__ grad_chol_var, int num_union, int num_to_sample, int dim, int num_mc, + uint64_t seed, double best, bool configure_for_test, double* __restrict__ random_number_grad_ei, + double * __restrict__ grad_ei, double * __restrict__ gpu_mu, double * __restrict__ gpu_chol_var, + double * __restrict__ gpu_grad_mu, double * __restrict__ gpu_grad_chol_var, + double* __restrict__ gpu_random_number_grad_ei, double * __restrict__ gpu_grad_ei_storage); + +/*!\rst + Allocate GPU memory for storing an array. This is same as malloc in C, with error handling. + \param + :num_doubles: number of double numbers contained in the array + :address_of_ptr_to_gpu_memory: address of the pointer to memory on GPU + \return + CudaError state, which contains error information, file name, line and function name of the function that occurs error +\endrst*/ +extern "C" CudaError CudaAllocateMemForDoubleVector(int num_doubles, double** __restrict__ address_of_ptr_to_gpu_memory); + +/*!\rst + Free GPU memory, same as free() in C. + \param + :ptr_to_gpu_memory: pointer to memory on GPU to free +\endrst*/ +extern "C" void CudaFreeMem(double* __restrict__ ptr_to_gpu_memory); + +/*!\rst + Setup GPU device, and all GPU function calls will be operated on the GPU activated by this function. + \param + :devID: the ID of GPU device to setup + \return + CudaError state, which contains error information, file name, line and function name of the function that occurs error +\endrst*/ +extern "C" CudaError CudaSetDevice(int devID); + +} // end namespace optimal_learning +#endif // MOE_OPTIMAL_LEARNING_CPP_GPU_GPP_CUDA_MATH_HPP_