-
Notifications
You must be signed in to change notification settings - Fork 140
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Jialei gh219 improve consistency check between gpu vs cpu computation for general qp ei #267
Changes from 44 commits
6f2a3d0
f5e5791
65e3636
cc35507
d90a1c7
3c67826
346b3b1
df9c1bc
2b4e0d0
1cc720c
2854aaf
cd66a21
0980fd8
ea8c417
77edcb3
a4880ff
895b371
2aa1a09
b53ba32
3009f2d
c5a06fc
42362f5
406929e
ef82b1e
0251b0d
4f15598
63fb267
2786617
c44a7f8
c68ed9f
ba82f53
510d208
68d86be
30496ee
e418390
9c3d8fb
7d5134b
b504363
4590675
8561e8b
3cfcae5
c92a45d
eb8db1d
e24bd22
02e9034
7ede2dd
918f7c2
8ab2d8a
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -73,6 +73,7 @@ set(OPTIMAL_LEARNING_CORE_SOURCES | |
gpp_math.cpp | ||
gpp_model_selection.cpp | ||
gpp_random.cpp | ||
gpp_expected_improvement_gpu.cpp | ||
) | ||
|
||
# readonly | ||
|
@@ -88,6 +89,7 @@ set(OPTIMAL_LEARNING_TEST_SOURCES | |
gpp_random_test.cpp | ||
gpp_test_utils.cpp | ||
gpp_test_utils_test.cpp | ||
gpp_expected_improvement_gpu_test.cpp | ||
) | ||
|
||
# readonly | ||
|
@@ -228,6 +230,11 @@ function(configure_exec_targets exec_names exec_sources dependencies compile_fla | |
COMPILE_DEFINITIONS "${compile_definitions}" | ||
LINK_FLAGS "${EXTRA_LINK_FLAGS}" | ||
) | ||
# When GPU support is requested, make target ${name} depend on GPU_LIB and link it | ||
# against the CUDA libraries plus the libOL_GPU.so produced under gpu/. | ||
# MOE_USE_GPU is quoted so an unset/empty value compares as false instead of | ||
# leaving if() with a dangling MATCHES operator (a configure-time error). | ||
# Note MATCHES is a regex match: any value containing "1" enables this branch. | ||
if ("${MOE_USE_GPU}" MATCHES "1") | ||
add_dependencies(${name} GPU_LIB) | ||
target_link_libraries(${name} ${CUDA_LIBRARIES} | ||
${CMAKE_BINARY_DIR}/gpu/libOL_GPU.so) | ||
endif() | ||
endforeach() | ||
endfunction(configure_exec_targets) | ||
|
||
|
@@ -258,6 +265,50 @@ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -fopenmp -Wall -Wextra ${EXTR | |
# O3 takes longer to compile and the code produced is at best no faster than O2 (gcc, icc). | ||
string(REGEX REPLACE "O3" "O2" CMAKE_CXX_FLAGS_RELEASE ${CMAKE_CXX_FLAGS_RELEASE}) | ||
|
||
#### GPU Component | ||
# readonly | ||
set(EXTRA_COMPILE_DEFINITIONS_GPU OL_GPU_ENABLED) | ||
|
||
# The CUDA C compiler may differ from the C/C++ compiler used to compile the cpp | ||
# code, because the currently available CUDA version does not support gcc 4.7+ | ||
set(GPU_CC ${CMAKE_C_COMPILER}) | ||
set(GPU_CXX ${CMAKE_CXX_COMPILER}) | ||
|
||
if (EXISTS ${MOE_GPU_CC}) | ||
set(GPU_CC ${MOE_GPU_CC}) | ||
endif() | ||
if (EXISTS ${MOE_GPU_CXX}) | ||
set(GPU_CXX ${MOE_GPU_CXX}) | ||
endif() | ||
|
||
# If MOE_USE_GPU is turned on via MOE_CMAKE_OPTS, cmake will try to find CUDA | ||
# package and call gpu cmake to build CUDA code | ||
# GPU build section: aborts unless MOE_CUDA_SDK_INCLUDE_DIRS names an existing path, | ||
# requires CUDA >= 5.0, appends EXTRA_COMPILE_DEFINITIONS_GPU to the compile | ||
# definitions, and defines the GPU_FOLDER / GPU_LIB custom targets that run a nested | ||
# cmake + make inside ${CMAKE_BINARY_DIR}/gpu/ with CC/CXX set to GPU_CC/GPU_CXX. | ||
# MOE_USE_GPU is quoted so an unset/empty value compares as false instead of | ||
# leaving if() with a dangling MATCHES operator (a configure-time error). | ||
if ("${MOE_USE_GPU}" MATCHES "1") | ||
if (NOT (EXISTS ${MOE_CUDA_SDK_INCLUDE_DIRS})) | ||
message( FATAL_ERROR "MOE_CUDA_SDK_INCLUDE_DIRS not set!" ) | ||
endif() | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. is this required to build w/cuda? i.e., cmake will never figure this out on its own? (e.g., we don't require that users specify the other MOE_FOO env vars, but they're there in case users want to override default behavior) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Because CUDA SDK is an optional package for CUDA installation (it's a separate file from the main CUDA package for download in CUDA website), I read .cmake file and it only looks for include_dir for CUDA main package. However, all the error handling utilities are provided by CUDA SDK. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. So we have to add header files in CUDA SDK to include_dir as well. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hm, we should document these dependencies on the installation page. Like have a CUDA section where we list what cuda version, cuda compute capability is required, driver/software version, SDK versions, and whatever information is relevant/minimal. |
||
find_package(CUDA 5.0 REQUIRED) | ||
include_directories(${CUDA_INCLUDE_DIRS}) | ||
set(EXTRA_COMPILE_DEFINITIONS ${EXTRA_COMPILE_DEFINITIONS} | ||
${EXTRA_COMPILE_DEFINITIONS_GPU}) | ||
set(MOE_GPU_CMAKE_OPTS | ||
"-D MOE_CUDA_SDK_INCLUDE_DIRS=${MOE_CUDA_SDK_INCLUDE_DIRS}") | ||
add_custom_target( | ||
GPU_FOLDER | ||
WORKING_DIRECTORY ${CMAKE_BINARY_DIR} | ||
COMMAND mkdir -p ${CMAKE_BINARY_DIR}/gpu/ | ||
) | ||
add_custom_target( | ||
GPU_LIB | ||
DEPENDS GPU_FOLDER | ||
WORKING_DIRECTORY ${CMAKE_BINARY_DIR}/gpu/ | ||
COMMAND CC=${GPU_CC} CXX=${GPU_CXX} cmake `echo | ||
${MOE_GPU_CMAKE_OPTS}` ${CMAKE_SOURCE_DIR}/gpu/ | ||
COMMAND make | ||
) | ||
endif() | ||
|
||
|
||
#### Object libraries | ||
# See configure_object_library() function comments for more details. | ||
# WARNING: You MUST have compatible flags set between OBJECT libraries and targets that depend on them! | ||
|
@@ -299,9 +350,15 @@ add_library( | |
$<TARGET_OBJECTS:OPTIMAL_LEARNING_TEST_BUNDLE> | ||
${CMAKE_BINARY_DIR}/__init__.py | ||
) | ||
# With GPU support enabled, GPP must not be built before the GPU_LIB target. | ||
# MOE_USE_GPU is quoted so an unset/empty value compares as false instead of | ||
# leaving if() with a dangling MATCHES operator (a configure-time error). | ||
if ("${MOE_USE_GPU}" MATCHES "1") | ||
add_dependencies(GPP GPU_LIB) | ||
endif() | ||
|
||
target_link_libraries(GPP ${PYTHON_LIBRARIES} ${Boost_LIBRARIES}) | ||
|
||
# With GPU support enabled, additionally link GPP against the CUDA libraries and | ||
# the libOL_GPU.so produced under ${CMAKE_BINARY_DIR}/gpu/. | ||
# MOE_USE_GPU is quoted so an unset/empty value compares as false instead of | ||
# leaving if() with a dangling MATCHES operator (a configure-time error). | ||
if ("${MOE_USE_GPU}" MATCHES "1") | ||
target_link_libraries(GPP ${CUDA_LIBRARIES} ${CMAKE_BINARY_DIR}/gpu/libOL_GPU.so) | ||
endif() | ||
|
||
# Make sure lib ends in ".so" | ||
set_target_properties( | ||
GPP PROPERTIES | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,206 @@ | ||
/*! | ||
\file gpp_expected_improvement_gpu.cpp | ||
\rst | ||
This file contains the implementations of the GPU-related functions. They are C++ wrappers around the | ||
CUDA C functions defined in gpu/gpp_cuda_math.cu. | ||
\endrst*/ | ||
|
||
#include "gpp_expected_improvement_gpu.hpp" | ||
|
||
#include <algorithm> | ||
#include <vector> | ||
|
||
#include "gpp_common.hpp" | ||
#include "gpp_exception.hpp" | ||
#include "gpp_logging.hpp" | ||
#include "gpp_math.hpp" | ||
#include "gpp_random.hpp" | ||
|
||
#ifdef OL_GPU_ENABLED | ||
|
||
#include "gpu/gpp_cuda_math.hpp" | ||
#include "driver_types.h" | ||
#include "cuda_runtime.h" | ||
|
||
#endif | ||
|
||
namespace optimal_learning { | ||
|
||
#ifdef OL_GPU_ENABLED | ||
|
||
// Computes a padded element count: num_mc_itr is divided (integer division) by | ||
// num_threads * num_blocks, one is added to the quotient, and the result is scaled | ||
// back by num_threads * num_blocks and then by num_points. | ||
// The "+ 1" is unconditional, so a full extra chunk is added even when num_mc_itr is | ||
// already an exact multiple of num_threads * num_blocks. | ||
// The static_cast<int> wraps an int / int expression and therefore does not change the value. | ||
// NOTE(review): num_threads * num_blocks == 0 would divide by zero; callers in this file | ||
// pass the kEI*/kGradEI* constants — confirm those are positive. | ||
inline int get_vector_size(int num_mc_itr, int num_threads, int num_blocks, int num_points) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. FunctionNamesAreCamelCase There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. also, you don't need to mark functions that are defined in only one translation unit with 'inline'. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. fixed |
||
return ((static_cast<int>(num_mc_itr / (num_threads * num_blocks)) + 1) * (num_threads * num_blocks) * num_points); | ||
} | ||
|
||
// Constructs a pointer wrapper for num_doubles_in doubles. | ||
// For a positive count, CudaAllocateMemForDoubleVector is asked to fill ptr (presumably | ||
// with GPU memory — confirm in gpu/gpp_cuda_math); if it reports anything other than | ||
// cudaSuccess, ptr is reset to nullptr and an OptimalLearningCudaException is thrown. | ||
// For a count <= 0 no allocation is attempted and ptr is nullptr. | ||
CudaDevicePointer::CudaDevicePointer(int num_doubles_in) : num_doubles(num_doubles_in) { | ||
if (num_doubles_in > 0) { | ||
CudaError _err = CudaAllocateMemForDoubleVector(num_doubles, &ptr); | ||
if (_err.err != cudaSuccess) { | ||
// Null the pointer before throwing so no stale value is left in ptr. | ||
ptr = nullptr; | ||
ThrowException(OptimalLearningCudaException(_err)); | ||
} | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. on the CPU, if malloc fails, the ptr is set to null. that might happen on the GPU as well (and then you won't need to do it explicitly and can use your standard error macro). I'd double check that. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ping |
||
} else { | ||
ptr = nullptr; | ||
} | ||
} | ||
|
||
// Releases the wrapped pointer via CudaFreeMem. The call is unconditional, so it also | ||
// runs when the constructor left ptr == nullptr; any status CudaFreeMem may report is | ||
// not inspected here. | ||
// NOTE(review): confirm CudaFreeMem tolerates a null pointer. | ||
CudaDevicePointer::~CudaDevicePointer() { | ||
CudaFreeMem(ptr); | ||
} | ||
|
||
// Builds the exception from a CudaError: forwards its file_and_line_info and func_info | ||
// fields to the OptimalLearningException base and uses cudaGetErrorString(_err.err) | ||
// as the message argument. | ||
OptimalLearningCudaException::OptimalLearningCudaException(const CudaError& _err) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 'regular' variables shouldn't be named with a leading underscore There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. fixed |
||
: OptimalLearningException(_err.file_and_line_info, _err.func_info, cudaGetErrorString(_err.err)) {} | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. closing brace goes on its own line There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. fixed |
||
|
||
// Computes the expected improvement for the points held in ei_state and returns it. | ||
// Steps: fill ei_state->to_sample_mean and ei_state->cholesky_to_sample_var from the | ||
// Gaussian process, run ComputeCholeskyFactorL on the variance buffer, draw a seed from | ||
// ei_state->uniform_rng, then hand everything to CudaGetEI, which writes the result | ||
// through &EI_val. | ||
// Throws SingularMatrixException when ComputeCholeskyFactorL returns a nonzero index. | ||
// CudaGetEI is wrapped in OL_CUDA_ERROR_THROW (presumably throws on a CUDA error — | ||
// see the macro definition). | ||
// Side effects: overwrites the host buffers named above and advances the RNG engine. | ||
double CudaExpectedImprovementEvaluator::ComputeExpectedImprovement(StateType * ei_state) const { | ||
double EI_val; | ||
int num_union = ei_state->num_union; | ||
gaussian_process_->ComputeMeanOfPoints(ei_state->points_to_sample_state, ei_state->to_sample_mean.data()); | ||
gaussian_process_->ComputeVarianceOfPoints(&(ei_state->points_to_sample_state), ei_state->cholesky_to_sample_var.data()); | ||
// The variance buffer is passed to the factorization, and the same buffer is later given to the GPU call. | ||
int leading_minor_index = ComputeCholeskyFactorL(num_union, ei_state->cholesky_to_sample_var.data()); | ||
if (unlikely(leading_minor_index != 0)) { | ||
OL_THROW_EXCEPTION(SingularMatrixException, "GP-Variance matrix singular. Check for duplicate points_to_sample/being_sampled or points_to_sample/being_sampled duplicating points_sampled with 0 noise.", ei_state->cholesky_to_sample_var.data(), | ||
num_union, leading_minor_index); | ||
} | ||
// One draw from the state's engine seeds this evaluation; the value is stored as unsigned int. | ||
unsigned int seed_in = (ei_state->uniform_rng->GetEngine())(); | ||
OL_CUDA_ERROR_THROW(CudaGetEI(ei_state->to_sample_mean.data(), ei_state->cholesky_to_sample_var.data(), | ||
best_so_far_, num_union, ei_state->gpu_mu.ptr, ei_state->gpu_chol_var.ptr, | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. please adjust these indent levels so that the arguments all line up (here & elsewhere) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. fixed |
||
ei_state->gpu_ei_storage.ptr, seed_in, num_mc_, &EI_val, | ||
ei_state->gpu_random_number_ei.ptr, ei_state->random_number_ei.data(), | ||
ei_state->configure_for_test)); | ||
return EI_val; | ||
} | ||
|
||
// Computes the gradient of expected improvement for the points held in ei_state and | ||
// writes it to grad_ei (filled by CudaGetGradEI). | ||
// Throws OptimalLearningException if ei_state->num_derivatives == 0, i.e. the state was | ||
// built with gradients disabled, and SingularMatrixException when ComputeCholeskyFactorL | ||
// returns a nonzero index. CudaGetGradEI is wrapped in OL_CUDA_ERROR_THROW (presumably | ||
// throws on a CUDA error — see the macro definition). | ||
// Side effects: overwrites to_sample_mean, grad_mu, cholesky_to_sample_var and | ||
// grad_chol_decomp in ei_state and advances its RNG engine. | ||
void CudaExpectedImprovementEvaluator::ComputeGradExpectedImprovement(StateType * ei_state, | ||
double * restrict grad_ei) const { | ||
if (ei_state->num_derivatives == 0) { | ||
OL_THROW_EXCEPTION(OptimalLearningException, "configure_for_gradients set to false, gradient computation is disabled!"); | ||
} | ||
const int num_union = ei_state->num_union; | ||
const int num_to_sample = ei_state->num_to_sample; | ||
gaussian_process_->ComputeMeanOfPoints(ei_state->points_to_sample_state, ei_state->to_sample_mean.data()); | ||
gaussian_process_->ComputeGradMeanOfPoints(ei_state->points_to_sample_state, ei_state->grad_mu.data()); | ||
gaussian_process_->ComputeVarianceOfPoints(&(ei_state->points_to_sample_state), ei_state->cholesky_to_sample_var.data()); | ||
int leading_minor_index = ComputeCholeskyFactorL(num_union, ei_state->cholesky_to_sample_var.data()); | ||
if (unlikely(leading_minor_index != 0)) { | ||
OL_THROW_EXCEPTION(SingularMatrixException, "GP-Variance matrix singular. Check for duplicate points_to_sample/being_sampled or points_to_sample/being_sampled duplicating points_sampled with 0 noise.", ei_state->cholesky_to_sample_var.data(), | ||
num_union, leading_minor_index); | ||
} | ||
|
||
// Runs after the factorization above and receives the factorized variance buffer as input. | ||
gaussian_process_->ComputeGradCholeskyVarianceOfPoints(&(ei_state->points_to_sample_state), | ||
ei_state->cholesky_to_sample_var.data(), | ||
ei_state->grad_chol_decomp.data()); | ||
// One draw from the state's engine seeds this evaluation; the value is stored as unsigned int. | ||
unsigned int seed_in = (ei_state->uniform_rng->GetEngine())(); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. change "unsigned int" to "auto" (here and elsewhere). if we use a 64-bit RNG, auto will work seamlessly There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. used uint64_t instead |
||
|
||
OL_CUDA_ERROR_THROW(CudaGetGradEI(ei_state->to_sample_mean.data(), ei_state->grad_mu.data(), | ||
ei_state->cholesky_to_sample_var.data(), ei_state->grad_chol_decomp.data(), | ||
best_so_far_, num_union, num_to_sample, dim_, | ||
(ei_state->gpu_mu).ptr, (ei_state->gpu_grad_mu).ptr, (ei_state->gpu_chol_var).ptr, | ||
(ei_state->gpu_grad_chol_var).ptr, (ei_state->gpu_grad_ei_storage).ptr, | ||
seed_in, num_mc_, grad_ei, ei_state->gpu_random_number_grad_ei.ptr, | ||
ei_state->random_number_grad_ei.data(), ei_state->configure_for_test)); | ||
} | ||
|
||
// Selects the device identified by devID through CudaSetDevice; the call is wrapped in | ||
// OL_CUDA_ERROR_THROW (presumably throws on a CUDA error — see the macro definition). | ||
void CudaExpectedImprovementEvaluator::SetupGPU(int devID) { | ||
OL_CUDA_ERROR_THROW(CudaSetDevice(devID)); | ||
} | ||
|
||
// Constructs the evaluator: records the GP's dimension, the Monte-Carlo count num_mc_in | ||
// and best_so_far, stores the ADDRESS of gaussian_process_in (so the referenced object | ||
// must outlive this evaluator), and then calls SetupGPU(devID_in). | ||
CudaExpectedImprovementEvaluator::CudaExpectedImprovementEvaluator(const GaussianProcess& gaussian_process_in, | ||
int num_mc_in, double best_so_far, int devID_in) | ||
: dim_(gaussian_process_in.dim()), | ||
num_mc_(num_mc_in), | ||
best_so_far_(best_so_far), | ||
gaussian_process_(&gaussian_process_in) { | ||
SetupGPU(devID_in); | ||
} | ||
|
||
// Calls cudaDeviceReset() on destruction; its return status is ignored. | ||
// NOTE(review): per the CUDA runtime docs this resets the current device for the whole | ||
// process — confirm no CudaDevicePointer (e.g. inside a state object) is still alive | ||
// when an evaluator is destroyed. | ||
CudaExpectedImprovementEvaluator::~CudaExpectedImprovementEvaluator() { | ||
cudaDeviceReset(); | ||
} | ||
|
||
// Constructs a state with test mode off: configure_for_test is false and all four | ||
// random-number buffers (gpu_random_number_ei, gpu_random_number_grad_ei, | ||
// random_number_ei, random_number_grad_ei) are created with size 0. | ||
// num_derivatives is num_to_sample when configure_for_gradients is true, otherwise 0, | ||
// which also makes every buffer sized by "* num_derivatives" empty. | ||
// Several initializers read previously initialized members (dim, num_to_sample, | ||
// num_union, num_derivatives), so they rely on the member declaration order in the | ||
// class — NOTE(review): confirm against the header. | ||
// uniform_rng_in is stored as a raw pointer; it is not copied here. | ||
CudaExpectedImprovementState::CudaExpectedImprovementState(const EvaluatorType& ei_evaluator, | ||
double const * restrict points_to_sample, | ||
double const * restrict points_being_sampled, | ||
int num_to_sample_in, int num_being_sampled_in, | ||
bool configure_for_gradients, | ||
UniformRandomGenerator * uniform_rng_in) | ||
: dim(ei_evaluator.dim()), | ||
num_to_sample(num_to_sample_in), | ||
num_being_sampled(num_being_sampled_in), | ||
num_derivatives(configure_for_gradients ? num_to_sample : 0), | ||
num_union(num_to_sample + num_being_sampled), | ||
union_of_points(BuildUnionOfPoints(points_to_sample, points_being_sampled, num_to_sample, num_being_sampled, dim)), | ||
points_to_sample_state(*ei_evaluator.gaussian_process(), union_of_points.data(), num_union, num_derivatives), | ||
uniform_rng(uniform_rng_in), | ||
to_sample_mean(num_union), | ||
grad_mu(dim*num_derivatives), | ||
cholesky_to_sample_var(Square(num_union)), | ||
grad_chol_decomp(dim*Square(num_union)*num_derivatives), | ||
configure_for_test(false), | ||
gpu_mu(num_union), | ||
gpu_chol_var(Square(num_union)), | ||
gpu_grad_mu(dim * num_derivatives), | ||
gpu_grad_chol_var(dim * Square(num_union) * num_derivatives), | ||
gpu_ei_storage(kEINumThreads * kEINumBlocks), | ||
gpu_grad_ei_storage(kGradEINumThreads * kGradEINumBlocks * dim * num_derivatives), | ||
gpu_random_number_ei(0), | ||
gpu_random_number_grad_ei(0), | ||
random_number_ei(0), | ||
random_number_grad_ei(0) { | ||
} | ||
|
||
// Constructs a state with an explicit test flag. Identical to the overload without | ||
// configure_for_test_in except that configure_for_test takes the caller's value and, | ||
// when it is true, the four random-number buffers are sized with get_vector_size() | ||
// from ei_evaluator.num_mc(), the EI / grad-EI thread and block constants, and | ||
// num_union; when it is false they are created with size 0. | ||
// The buffer-size expressions read the MEMBER configure_for_test (not the parameter), | ||
// and other initializers read dim, num_to_sample, num_union and num_derivatives, so | ||
// they rely on the member declaration order in the class — NOTE(review): confirm | ||
// against the header. | ||
CudaExpectedImprovementState::CudaExpectedImprovementState(const EvaluatorType& ei_evaluator, | ||
double const * restrict points_to_sample, | ||
double const * restrict points_being_sampled, | ||
int num_to_sample_in, int num_being_sampled_in, | ||
bool configure_for_gradients, | ||
UniformRandomGenerator * uniform_rng_in, | ||
bool configure_for_test_in) | ||
: dim(ei_evaluator.dim()), | ||
num_to_sample(num_to_sample_in), | ||
num_being_sampled(num_being_sampled_in), | ||
num_derivatives(configure_for_gradients ? num_to_sample : 0), | ||
num_union(num_to_sample + num_being_sampled), | ||
union_of_points(BuildUnionOfPoints(points_to_sample, points_being_sampled, num_to_sample, num_being_sampled, dim)), | ||
points_to_sample_state(*ei_evaluator.gaussian_process(), union_of_points.data(), num_union, num_derivatives), | ||
uniform_rng(uniform_rng_in), | ||
to_sample_mean(num_union), | ||
grad_mu(dim*num_derivatives), | ||
cholesky_to_sample_var(Square(num_union)), | ||
grad_chol_decomp(dim*Square(num_union)*num_derivatives), | ||
configure_for_test(configure_for_test_in), | ||
gpu_mu(num_union), | ||
gpu_chol_var(Square(num_union)), | ||
gpu_grad_mu(dim * num_derivatives), | ||
gpu_grad_chol_var(dim * Square(num_union) * num_derivatives), | ||
gpu_ei_storage(kEINumThreads * kEINumBlocks), | ||
gpu_grad_ei_storage(kGradEINumThreads * kGradEINumBlocks * dim * num_derivatives), | ||
gpu_random_number_ei(configure_for_test ? get_vector_size(ei_evaluator.num_mc(), kEINumThreads, kEINumBlocks, num_union) : 0), | ||
gpu_random_number_grad_ei(configure_for_test ? get_vector_size(ei_evaluator.num_mc(), kGradEINumThreads, kGradEINumBlocks, num_union) : 0), | ||
random_number_ei(configure_for_test ? get_vector_size(ei_evaluator.num_mc(), kEINumThreads, kEINumBlocks, num_union) : 0), | ||
random_number_grad_ei(configure_for_test ? get_vector_size(ei_evaluator.num_mc(), kGradEINumThreads, kGradEINumBlocks, num_union) : 0) { | ||
} | ||
|
||
// Returns a new vector of dim * (num_to_sample + num_being_sampled) doubles: the first | ||
// dim * num_to_sample entries are copied from points_to_sample, followed by | ||
// dim * num_being_sampled entries copied from points_being_sampled. | ||
// Both inputs must therefore hold at least that many doubles. | ||
// NOTE(review): declared noexcept although it allocates a std::vector; an allocation | ||
// failure would terminate the program instead of propagating. | ||
std::vector<double> CudaExpectedImprovementState::BuildUnionOfPoints(double const * restrict points_to_sample, double const * restrict points_being_sampled, | ||
int num_to_sample, int num_being_sampled, int dim) noexcept { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. indent level There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. fixed |
||
std::vector<double> union_of_points(dim*(num_to_sample + num_being_sampled)); | ||
std::copy(points_to_sample, points_to_sample + dim*num_to_sample, union_of_points.data()); | ||
std::copy(points_being_sampled, points_being_sampled + dim*num_being_sampled, union_of_points.data() + dim*num_to_sample); | ||
return union_of_points; | ||
} | ||
|
||
// Replaces the points_to_sample portion of the state: copies num_to_sample * dim doubles | ||
// from points_to_sample over the front of union_of_points (the trailing entries are left | ||
// untouched), then re-runs points_to_sample_state.SetupState with the evaluator's | ||
// Gaussian process and the updated union. | ||
void CudaExpectedImprovementState::UpdateCurrentPoint(const EvaluatorType& ei_evaluator, double const * restrict points_to_sample) { | ||
// update points_to_sample in union_of_points | ||
std::copy(points_to_sample, points_to_sample + num_to_sample*dim, union_of_points.data()); | ||
|
||
// evaluate derived quantities for the GP | ||
points_to_sample_state.SetupState(*ei_evaluator.gaussian_process(), union_of_points.data(), num_union, num_derivatives); | ||
} | ||
|
||
// Re-initializes the state for a new set of points_to_sample; this simply forwards to | ||
// UpdateCurrentPoint with the same arguments. | ||
void CudaExpectedImprovementState::SetupState(const EvaluatorType& ei_evaluator, double const * restrict points_to_sample) { | ||
// update quantities derived from points_to_sample | ||
UpdateCurrentPoint(ei_evaluator, points_to_sample); | ||
} | ||
#endif // OL_GPU_ENABLED | ||
|
||
} // end namespace optimal_learning | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'd add a few comments here about:
but the cmake stuff is looking really clean/easy to use now, thanks!
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
fixed