diff --git a/moe/optimal_learning/cpp/CMakeLists.txt b/moe/optimal_learning/cpp/CMakeLists.txt
index 3affba17..d3df48ab 100644
--- a/moe/optimal_learning/cpp/CMakeLists.txt
+++ b/moe/optimal_learning/cpp/CMakeLists.txt
@@ -73,6 +73,7 @@ set(OPTIMAL_LEARNING_CORE_SOURCES
   gpp_math.cpp
   gpp_model_selection.cpp
   gpp_random.cpp
+  gpp_expected_improvement_gpu.cpp
   )
 
 # readonly
@@ -88,6 +89,7 @@ set(OPTIMAL_LEARNING_TEST_SOURCES
   gpp_random_test.cpp
   gpp_test_utils.cpp
   gpp_test_utils_test.cpp
+  gpp_expected_improvement_gpu_test.cpp
   )
 
 # readonly
@@ -228,6 +230,11 @@ function(configure_exec_targets exec_names exec_sources dependencies compile_fla
       COMPILE_DEFINITIONS "${compile_definitions}"
       LINK_FLAGS "${EXTRA_LINK_FLAGS}"
       )
+    if (${MOE_USE_GPU} MATCHES "1")
+        add_dependencies(${name} GPU_LIB)
+        target_link_libraries(${name} ${CUDA_LIBRARIES}
+            ${CMAKE_BINARY_DIR}/gpu/libOL_GPU.so)
+    endif()
   endforeach()
 endfunction(configure_exec_targets)
 
@@ -258,6 +265,50 @@ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -fopenmp -Wall -Wextra ${EXTR
 # O3 takes longer to compile and the code produced is at best no faster than O2 (gcc, icc).
 string(REGEX REPLACE "O3" "O2" CMAKE_CXX_FLAGS_RELEASE ${CMAKE_CXX_FLAGS_RELEASE})
 
+#### GPU Component
+# readonly
+set(EXTRA_COMPILE_DEFINITIONS_GPU OL_GPU_ENABLED)
+
+# CUDA C compiler may be different from the c/c++ compiler used for compiling cpp
+# code, because current available CUDA version does not support gcc 4.7+
+set(GPU_CC ${CMAKE_C_COMPILER})
+set(GPU_CXX ${CMAKE_CXX_COMPILER})
+
+if (EXISTS ${MOE_GPU_CC})
+    set(GPU_CC ${MOE_GPU_CC})
+endif()
+if (EXISTS ${MOE_GPU_CXX})
+    set(GPU_CXX ${MOE_GPU_CXX})
+endif()
+
+# If MOE_USE_GPU is turned on via MOE_CMAKE_OPTS, cmake will try to find CUDA
+# package and call gpu cmake to build CUDA code
+if (${MOE_USE_GPU} MATCHES "1")
+    if (NOT (EXISTS ${MOE_CUDA_SDK_INCLUDE_DIRS}))
+        message( FATAL_ERROR "MOE_CUDA_SDK_INCLUDE_DIRS not set!" )
+    endif()
+    find_package(CUDA 5.0 REQUIRED)
+    include_directories(${CUDA_INCLUDE_DIRS})
+    set(EXTRA_COMPILE_DEFINITIONS ${EXTRA_COMPILE_DEFINITIONS}
+       ${EXTRA_COMPILE_DEFINITIONS_GPU})
+    set(MOE_GPU_CMAKE_OPTS
+        "-D MOE_CUDA_SDK_INCLUDE_DIRS=${MOE_CUDA_SDK_INCLUDE_DIRS}")
+    add_custom_target(
+        GPU_FOLDER
+        WORKING_DIRECTORY ${CMAKE_BINARY_DIR}
+        COMMAND mkdir -p ${CMAKE_BINARY_DIR}/gpu/
+        )
+    add_custom_target(
+        GPU_LIB
+        DEPENDS GPU_FOLDER
+        WORKING_DIRECTORY ${CMAKE_BINARY_DIR}/gpu/
+        COMMAND CC=${GPU_CC} CXX=${GPU_CXX} cmake `echo
+        ${MOE_GPU_CMAKE_OPTS}` ${CMAKE_SOURCE_DIR}/gpu/
+        COMMAND make
+        )
+endif()
+
+
 #### Object libraries
 # See configure_object_library() function comments for more details.
 # WARNING: You MUST have compatible flags set between OBJECT libraries and targets that depend on them!
@@ -299,9 +350,15 @@ add_library(
   $<TARGET_OBJECTS:OPTIMAL_LEARNING_TEST_BUNDLE>
   ${CMAKE_BINARY_DIR}/__init__.py
   )
+if (${MOE_USE_GPU} MATCHES "1")
+    add_dependencies(GPP GPU_LIB)
+endif()
 
 target_link_libraries(GPP ${PYTHON_LIBRARIES} ${Boost_LIBRARIES})
-
+if (${MOE_USE_GPU} MATCHES "1")
+    target_link_libraries(GPP ${CUDA_LIBRARIES} ${CMAKE_BINARY_DIR}/gpu/libOL_GPU.so)
+endif()
+        
 # Make sure lib ends in ".so"
 set_target_properties(
   GPP PROPERTIES
diff --git a/moe/optimal_learning/cpp/gpp_expected_improvement_gpu.cpp b/moe/optimal_learning/cpp/gpp_expected_improvement_gpu.cpp
new file mode 100644
index 00000000..0f89c5e9
--- /dev/null
+++ b/moe/optimal_learning/cpp/gpp_expected_improvement_gpu.cpp
@@ -0,0 +1,211 @@
+/*!
+  \file gpp_expected_improvement_gpu.cpp
+  \rst
+  This file contains implementations of GPU related functions. They are actually C++ wrappers for
+  CUDA C functions defined in gpu/gpp_cuda_math.cu.
+\endrst*/
+
+#include "gpp_expected_improvement_gpu.hpp"
+
+#include <stdint.h>
+
+#include <algorithm>
+#include <vector>
+
+#include "gpp_common.hpp"
+#include "gpp_exception.hpp"
+#include "gpp_logging.hpp"
+#include "gpp_math.hpp"
+#include "gpp_random.hpp"
+
+#ifdef OL_GPU_ENABLED
+
+#include "gpu/gpp_cuda_math.hpp"
+#include "driver_types.h"
+#include "cuda_runtime.h"
+
+#endif
+
+namespace optimal_learning {
+
+#ifdef OL_GPU_ENABLED
+
+CudaDevicePointer::CudaDevicePointer(int num_doubles_in) : num_doubles(num_doubles_in) {
+  if (num_doubles_in > 0) {
+      CudaError error = CudaAllocateMemForDoubleVector(num_doubles, &ptr);
+      if (error.err != cudaSuccess) {
+          ptr = nullptr;
+          ThrowException(OptimalLearningCudaException(error));
+      }
+  } else {
+      ptr = nullptr;
+  }
+}
+
+CudaDevicePointer::~CudaDevicePointer() {
+    CudaFreeMem(ptr);
+}
+
+OptimalLearningCudaException::OptimalLearningCudaException(const CudaError& error)
+      : OptimalLearningException(error.file_and_line_info, error.func_info, cudaGetErrorString(error.err)) {
+}
+
+double CudaExpectedImprovementEvaluator::ComputeExpectedImprovement(StateType * ei_state) const {
+  double EI_val;
+  int num_union = ei_state->num_union;
+  gaussian_process_->ComputeMeanOfPoints(ei_state->points_to_sample_state, ei_state->to_sample_mean.data());
+  gaussian_process_->ComputeVarianceOfPoints(&(ei_state->points_to_sample_state), ei_state->cholesky_to_sample_var.data());
+  int leading_minor_index = ComputeCholeskyFactorL(num_union, ei_state->cholesky_to_sample_var.data());
+  if (unlikely(leading_minor_index != 0)) {
+    OL_THROW_EXCEPTION(SingularMatrixException,
+                       "GP-Variance matrix singular. Check for duplicate points_to_sample/being_sampled or points_to_sample/being_sampled duplicating points_sampled with 0 noise.",
+                       ei_state->cholesky_to_sample_var.data(), num_union, leading_minor_index);
+  }
+  uint64_t seed_in = (ei_state->uniform_rng->GetEngine())();
+  OL_CUDA_ERROR_THROW(CudaGetEI(ei_state->to_sample_mean.data(), ei_state->cholesky_to_sample_var.data(),
+                                num_union, num_mc_, seed_in, best_so_far_, ei_state->configure_for_test,
+                                ei_state->random_number_ei.data(), &EI_val, ei_state->gpu_mu.ptr,
+                                ei_state->gpu_chol_var.ptr, ei_state->gpu_random_number_ei.ptr,
+                                ei_state->gpu_ei_storage.ptr));
+  return EI_val;
+}
+
+void CudaExpectedImprovementEvaluator::ComputeGradExpectedImprovement(StateType * ei_state,
+                                                                      double * restrict grad_ei) const {
+  if (ei_state->num_derivatives == 0) {
+    OL_THROW_EXCEPTION(OptimalLearningException, "configure_for_gradients set to false, gradient computation is disabled!");
+  }
+  const int num_union = ei_state->num_union;
+  const int num_to_sample = ei_state->num_to_sample;
+  gaussian_process_->ComputeMeanOfPoints(ei_state->points_to_sample_state, ei_state->to_sample_mean.data());
+  gaussian_process_->ComputeGradMeanOfPoints(ei_state->points_to_sample_state, ei_state->grad_mu.data());
+  gaussian_process_->ComputeVarianceOfPoints(&(ei_state->points_to_sample_state), ei_state->cholesky_to_sample_var.data());
+  int leading_minor_index = ComputeCholeskyFactorL(num_union, ei_state->cholesky_to_sample_var.data());
+  if (unlikely(leading_minor_index != 0)) {
+    OL_THROW_EXCEPTION(SingularMatrixException,
+                       "GP-Variance matrix singular. Check for duplicate points_to_sample/being_sampled or points_to_sample/being_sampled duplicating points_sampled with 0 noise.",
+                       ei_state->cholesky_to_sample_var.data(), num_union, leading_minor_index);
+  }
+
+  gaussian_process_->ComputeGradCholeskyVarianceOfPoints(&(ei_state->points_to_sample_state),
+                                                         ei_state->cholesky_to_sample_var.data(),
+                                                         ei_state->grad_chol_decomp.data());
+  uint64_t seed_in = (ei_state->uniform_rng->GetEngine())();
+
+  OL_CUDA_ERROR_THROW(CudaGetGradEI(ei_state->to_sample_mean.data(), ei_state->cholesky_to_sample_var.data(),
+                                    ei_state->grad_mu.data(), ei_state->grad_chol_decomp.data(), num_union,
+                                    num_to_sample, dim_, num_mc_, seed_in, best_so_far_, ei_state->configure_for_test,
+                                    ei_state->random_number_grad_ei.data(), grad_ei, ei_state->gpu_mu.ptr,
+                                    ei_state->gpu_chol_var.ptr, ei_state->gpu_grad_mu.ptr, ei_state->gpu_grad_chol_var.ptr,
+                                    ei_state->gpu_random_number_grad_ei.ptr, ei_state->gpu_grad_ei_storage.ptr));
+}
+
+void CudaExpectedImprovementEvaluator::SetupGPU(int devID) {
+  OL_CUDA_ERROR_THROW(CudaSetDevice(devID));
+}
+
+CudaExpectedImprovementEvaluator::CudaExpectedImprovementEvaluator(const GaussianProcess& gaussian_process_in,
+                                                                   int num_mc_in, double best_so_far, int devID_in)
+      : dim_(gaussian_process_in.dim()),
+        num_mc_(num_mc_in),
+        best_so_far_(best_so_far),
+        gaussian_process_(&gaussian_process_in) {
+    SetupGPU(devID_in);
+  }
+
+CudaExpectedImprovementEvaluator::~CudaExpectedImprovementEvaluator() {
+  cudaDeviceReset();
+}
+
+CudaExpectedImprovementState::CudaExpectedImprovementState(const EvaluatorType& ei_evaluator,
+                                                           double const * restrict points_to_sample,
+                                                           double const * restrict points_being_sampled,
+                                                           int num_to_sample_in, int num_being_sampled_in,
+                                                           bool configure_for_gradients,
+                                                           UniformRandomGenerator * uniform_rng_in)
+    : dim(ei_evaluator.dim()),
+      num_to_sample(num_to_sample_in),
+      num_being_sampled(num_being_sampled_in),
+      num_derivatives(configure_for_gradients ? num_to_sample : 0),
+      num_union(num_to_sample + num_being_sampled),
+      union_of_points(BuildUnionOfPoints(points_to_sample, points_being_sampled, num_to_sample, num_being_sampled, dim)),
+      points_to_sample_state(*ei_evaluator.gaussian_process(), union_of_points.data(), num_union, num_derivatives),
+      uniform_rng(uniform_rng_in),
+      to_sample_mean(num_union),
+      grad_mu(dim*num_derivatives),
+      cholesky_to_sample_var(Square(num_union)),
+      grad_chol_decomp(dim*Square(num_union)*num_derivatives),
+      configure_for_test(false),
+      gpu_mu(num_union),
+      gpu_chol_var(Square(num_union)),
+      gpu_grad_mu(dim * num_derivatives),
+      gpu_grad_chol_var(dim * Square(num_union) * num_derivatives),
+      gpu_ei_storage(kEINumThreads * kEINumBlocks),
+      gpu_grad_ei_storage(kGradEINumThreads * kGradEINumBlocks * dim * num_derivatives),
+      gpu_random_number_ei(0),
+      gpu_random_number_grad_ei(0),
+      random_number_ei(0),
+      random_number_grad_ei(0) {
+}
+
+CudaExpectedImprovementState::CudaExpectedImprovementState(const EvaluatorType& ei_evaluator,
+                                                           double const * restrict points_to_sample,
+                                                           double const * restrict points_being_sampled,
+                                                           int num_to_sample_in, int num_being_sampled_in,
+                                                           bool configure_for_gradients,
+                                                           UniformRandomGenerator * uniform_rng_in,
+                                                           bool configure_for_test_in)
+    : dim(ei_evaluator.dim()),
+      num_to_sample(num_to_sample_in),
+      num_being_sampled(num_being_sampled_in),
+      num_derivatives(configure_for_gradients ? num_to_sample : 0),
+      num_union(num_to_sample + num_being_sampled),
+      union_of_points(BuildUnionOfPoints(points_to_sample, points_being_sampled, num_to_sample, num_being_sampled, dim)),
+      points_to_sample_state(*ei_evaluator.gaussian_process(), union_of_points.data(), num_union, num_derivatives),
+      uniform_rng(uniform_rng_in),
+      to_sample_mean(num_union),
+      grad_mu(dim*num_derivatives),
+      cholesky_to_sample_var(Square(num_union)),
+      grad_chol_decomp(dim*Square(num_union)*num_derivatives),
+      configure_for_test(configure_for_test_in),
+      gpu_mu(num_union),
+      gpu_chol_var(Square(num_union)),
+      gpu_grad_mu(dim * num_derivatives),
+      gpu_grad_chol_var(dim * Square(num_union) * num_derivatives),
+      gpu_ei_storage(kEINumThreads * kEINumBlocks),
+      gpu_grad_ei_storage(kGradEINumThreads * kGradEINumBlocks * dim * num_derivatives),
+      gpu_random_number_ei(configure_for_test ? GetVectorSize(ei_evaluator.num_mc(), kEINumThreads, kEINumBlocks, num_union) : 0),
+      gpu_random_number_grad_ei(configure_for_test ? GetVectorSize(ei_evaluator.num_mc(), kGradEINumThreads, kGradEINumBlocks, num_union) : 0),
+      random_number_ei(configure_for_test ? GetVectorSize(ei_evaluator.num_mc(), kEINumThreads, kEINumBlocks, num_union) : 0),
+      random_number_grad_ei(configure_for_test ? GetVectorSize(ei_evaluator.num_mc(), kGradEINumThreads, kGradEINumBlocks, num_union) : 0) {
+}
+
+std::vector<double> CudaExpectedImprovementState::BuildUnionOfPoints(double const * restrict points_to_sample,
+                                                                     double const * restrict points_being_sampled,
+                                                                     int num_to_sample, int num_being_sampled, int dim) noexcept {
+  std::vector<double> union_of_points(dim*(num_to_sample + num_being_sampled));
+  std::copy(points_to_sample, points_to_sample + dim*num_to_sample, union_of_points.data());
+  std::copy(points_being_sampled, points_being_sampled + dim*num_being_sampled, union_of_points.data() + dim*num_to_sample);
+  return union_of_points;
+}
+
+int CudaExpectedImprovementState::GetVectorSize(int num_mc_itr, int num_threads, int num_blocks, int num_points) noexcept {
+  return ((static_cast<int>(num_mc_itr / (num_threads * num_blocks)) + 1) * (num_threads * num_blocks) * num_points);
+}
+
+void CudaExpectedImprovementState::UpdateCurrentPoint(const EvaluatorType& ei_evaluator, double const * restrict points_to_sample) {
+  // update points_to_sample in union_of_points
+  std::copy(points_to_sample, points_to_sample + num_to_sample*dim, union_of_points.data());
+
+  // evaluate derived quantities for the GP
+  points_to_sample_state.SetupState(*ei_evaluator.gaussian_process(), union_of_points.data(), num_union, num_derivatives);
+}
+
+void CudaExpectedImprovementState::SetupState(const EvaluatorType& ei_evaluator, double const * restrict points_to_sample) {
+  // update quantities derived from points_to_sample
+  UpdateCurrentPoint(ei_evaluator, points_to_sample);
+}
+#endif  // OL_GPU_ENABLED
+
+}  // end namespace optimal_learning
+
diff --git a/moe/optimal_learning/cpp/gpp_expected_improvement_gpu.hpp b/moe/optimal_learning/cpp/gpp_expected_improvement_gpu.hpp
new file mode 100644
index 00000000..121fe513
--- /dev/null
+++ b/moe/optimal_learning/cpp/gpp_expected_improvement_gpu.hpp
@@ -0,0 +1,335 @@
+/*!
+  \file gpp_expected_improvement_gpu.hpp
+  \rst
+  All GPU related functions are declared here, and any other C++ functions who wish to call GPU functions should only call functions here.
+\endrst*/
+
+#ifndef MOE_OPTIMAL_LEARNING_CPP_GPP_EXPECTED_IMPROVEMENT_GPU_HPP_
+#define MOE_OPTIMAL_LEARNING_CPP_GPP_EXPECTED_IMPROVEMENT_GPU_HPP_
+
+#include <algorithm>
+#include <vector>
+
+#include "gpp_common.hpp"
+#include "gpp_exception.hpp"
+#include "gpp_logging.hpp"
+#include "gpp_math.hpp"
+#include "gpp_random.hpp"
+
+#ifdef OL_GPU_ENABLED
+
+#include "gpu/gpp_cuda_math.hpp"
+/*!\rst
+  Macro that checks error message (CudaError object) returned by CUDA functions, and throws
+  OptimalLearningCudaException if there is error.
+\endrst*/
+#define OL_CUDA_ERROR_THROW(X) do {CudaError _ERR = (X); if ((_ERR).err != cudaSuccess) {ThrowException(OptimalLearningCudaException(_ERR));}} while (0)
+
+#endif
+
+namespace optimal_learning {
+#ifdef OL_GPU_ENABLED
+
+/*!\rst
+  This struct does the same job as C++ smart pointer. It contains pointer to memory location on
+  GPU, its constructor and destructor also take care of memory allocation/deallocation on GPU. 
+\endrst*/
+struct CudaDevicePointer final {
+  explicit CudaDevicePointer(int num_doubles_in);
+
+  ~CudaDevicePointer();
+
+  //! pointer to the memory location on gpu
+  double* ptr;
+  //! number of doubles to allocate on gpu, so the memory size is num_doubles * sizeof(double)
+  int num_doubles;
+
+  OL_DISALLOW_DEFAULT_AND_COPY_AND_ASSIGN(CudaDevicePointer);
+};
+
+/*!\rst
+  Exception to handle runtime errors returned by CUDA API functions. This class subclasses
+  OptimalLearningException in gpp_exception.hpp/cpp, and basiclly has the same functionality
+  as its superclass, except the constructor is different.
+\endrst*/
+class OptimalLearningCudaException : public OptimalLearningException {
+ public:
+  //! String name of this exception ofr logging.
+  constexpr static char const * kName = "OptimalLearningCudaException";
+
+  /*!\rst
+    Constructs a OptimalLearningCudaException with struct CudaError
+    \param
+      :error: C struct that contains error message returned by CUDA API functions
+  \endrst*/
+  explicit OptimalLearningCudaException(const CudaError& error);
+
+  OL_DISALLOW_DEFAULT_AND_ASSIGN(OptimalLearningCudaException);
+};
+
+struct CudaExpectedImprovementState;
+
+/*!\rst
+  This class has the same functionality as ExpectedImprovementEvaluator (see gpp_math.hpp),
+  except that computations are performed on GPU.
+\endrst*/
+class CudaExpectedImprovementEvaluator final {
+ public:
+  using StateType = CudaExpectedImprovementState;
+  /*!\rst
+    Constructor that also specify which gpu you want to use (for multi-gpu system)
+  \endrst*/
+  CudaExpectedImprovementEvaluator(const GaussianProcess& gaussian_process_in,
+                                   int num_mc_in, double best_so_far, int devID_in);
+
+  ~CudaExpectedImprovementEvaluator();
+
+  int dim() const noexcept OL_PURE_FUNCTION OL_WARN_UNUSED_RESULT {
+    return dim_;
+  }
+
+  int num_mc() const noexcept OL_PURE_FUNCTION OL_WARN_UNUSED_RESULT {
+    return num_mc_;
+  }
+
+  const GaussianProcess * gaussian_process() const noexcept OL_PURE_FUNCTION OL_WARN_UNUSED_RESULT {
+    return gaussian_process_;
+  }
+
+  /*!\rst
+    Wrapper for ComputeExpectedImprovement(); see that function for details.
+  \endrst*/
+  double ComputeObjectiveFunction(StateType * ei_state) const OL_NONNULL_POINTERS OL_WARN_UNUSED_RESULT {
+    return ComputeExpectedImprovement(ei_state);
+  }
+
+  /*!\rst
+    Wrapper for ComputeGradExpectedImprovement(); see that function for details.
+  \endrst*/
+  void ComputeGradObjectiveFunction(StateType * ei_state, double * restrict grad_ei) const OL_NONNULL_POINTERS {
+    ComputeGradExpectedImprovement(ei_state, grad_ei);
+  }
+
+  /*!\rst
+    This function has the same functionality as ComputeExpectedImprovement (see gpp_math.hpp)
+    in class ExpectedImprovementEvaluator.
+    \param
+      :ei_state[1]: properly configured state object
+    \output
+      :ei_state[1]: state with temporary storage modified; ``uniform_rng`` modified
+    \return
+      the expected improvement from sampling ``points_to_sample`` with ``points_being_sampled`` concurrent experiments
+  \endrst*/
+  double ComputeExpectedImprovement(StateType * ei_state) const OL_NONNULL_POINTERS OL_WARN_UNUSED_RESULT;
+
+  /*!\rst
+    This function has the same functionality as ComputeGradExpectedImprovement (see gpp_math.hpp)
+    in class ExpectedImprovementEvaluator.
+    \param
+      :ei_state[1]: properly configured state object
+    \output
+      :ei_state[1]: state with temporary storage modified; ``uniform_rng`` modified
+      :grad_ei[dim][num_to_sample]: gradient of EI
+  \endrst*/
+  void ComputeGradExpectedImprovement(StateType * ei_state, double * restrict grad_ei) const OL_NONNULL_POINTERS;
+
+  /*!\rst
+    Call CUDA API function to activate a GPU.
+    Refer to: http://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__DEVICE.html#group__CUDART__DEVICE_1g418c299b069c4803bfb7cab4943da383
+
+    \param
+      :devID: device ID of the GPU need to be activated
+  \endrst*/
+  void SetupGPU(int devID);
+
+  OL_DISALLOW_DEFAULT_AND_COPY_AND_ASSIGN(CudaExpectedImprovementEvaluator);
+
+ private:
+  //! spatial dimension (e.g., entries per point of points_sampled)
+  const int dim_;
+  //! number of mc iterations
+  int num_mc_;
+  //! best (minimum) objective function value (in points_sampled_value)
+  double best_so_far_;
+  //! pointer to gaussian process used in EI computations
+  const GaussianProcess * gaussian_process_;
+};
+
+/*!\rst
+  This has the same functionality as ExpectedImprovementState (see gpp_math.hpp) except that it is for GPU computing
+\endrst*/
+struct CudaExpectedImprovementState final {
+  using EvaluatorType = CudaExpectedImprovementEvaluator;
+
+  /*!\rst
+    Constructs an CudaExpectedImprovementState object with a specified source of randomness for
+    the purpose of computing EI(and its gradient) over the specified set of points to sample.
+    This establishes properly sized/initialized temporaries for EI computation, including dependent
+    state from the associated Gaussian Process (which arrives as part of the ei_evaluator).
+
+    .. WARNING:: This object is invalidated if the associated ei_evaluator is mutated.  SetupState()
+    should be called to reset.
+
+    .. WARNING::
+         Using this object to compute gradients when ``configure_for_gradients`` := false results in
+         UNDEFINED BEHAVIOR.
+
+    \param
+      :ei_evaluator: expected improvement evaluator object that specifies the parameters & GP for EI evaluation
+      :points_to_sample[dim][num_to_sample]: points at which to evaluate EI and/or its gradient to check their value in future experiments (i.e., test points for GP predictions)
+      :points_being_sampled[dim][num_being_sampled]: points being sampled in concurrent experiments
+      :num_to_sample: number of potential future samples; gradients are evaluated wrt these points (i.e., the "q" in q,p-EI)
+      :num_being_sampled: number of points being sampled in concurrent experiments (i.e., the "p" in q,p-EI)
+      :configure_for_gradients: true if this object will be used to compute gradients, false otherwise
+      :uniform_rng[1]: pointer to a properly initialized* UniformRandomGenerator object
+
+    .. NOTE::
+         * The UniformRandomGenerator object must already be seeded.  If multithreaded computation is used for EI, then every state object
+         must have a different UniformRandomGenerator (different seeds, not just different objects).
+  \endrst*/
+  CudaExpectedImprovementState(const EvaluatorType& ei_evaluator, double const * restrict points_to_sample,
+                               double const * restrict points_being_sampled, int num_to_sample_in,
+                               int num_being_sampled_in, bool configure_for_gradients,
+                               UniformRandomGenerator* uniform_rng_in);
+
+  // constructor for setting up unit test
+  CudaExpectedImprovementState(const EvaluatorType& ei_evaluator, double const * restrict points_to_sample,
+                               double const * restrict points_being_sampled, int num_to_sample_in,
+                               int num_being_sampled_in, bool configure_for_gradients,
+                               UniformRandomGenerator * uniform_rng_in, bool configure_for_test);
+
+  CudaExpectedImprovementState(CudaExpectedImprovementState&& OL_UNUSED(other)) = default;
+
+  /*!\rst
+    Create a vector with the union of points_to_sample and points_being_sampled (the latter is appended to the former).
+
+    Note the l-value return. Assigning the return to a std::vector<double> or passing it as an argument to the ctor
+    will result in copy-elision or move semantics; no copying/performance loss.
+
+    \param:
+      :points_to_sample[dim][num_to_sample]: points at which to evaluate EI and/or its gradient to check their value in future experiments (i.e., test points for GP predictions)
+      :points_being_sampled[dim][num_being_sampled]: points being sampled in concurrent experiments
+      :num_to_sample: number of potential future samples; gradients are evaluated wrt these points (i.e., the "q" in q,p-EI)
+      :num_being_sampled: number of points being sampled in concurrent experiments (i.e., the "p" in q,p-EI)
+      :dim: the number of spatial dimensions of each point array
+    \return
+      std::vector<double> with the union of the input arrays: points_being_sampled is *appended* to points_to_sample
+  \endrst*/
+  static std::vector<double> BuildUnionOfPoints(double const * restrict points_to_sample,
+                                                double const * restrict points_being_sampled,
+                                                int num_to_sample, int num_being_sampled, int dim)
+                                                noexcept OL_WARN_UNUSED_RESULT;
+
+  /*!\rst
+    A simple utility function to calculate how many random numbers will be generated by GPU computation of EI/gradEI given
+    number of MC simulations. (user set num_mc_itr is not necessarily equal to the actual num_mc_itr used in GPU computation,
+    because actual num_mc_itr has to be multiple of (num_threads * num_blocks)
+
+    \param:
+      :num_mc_itr: number of MC simulations
+      :num_threads: number of threads per block in GPU computation
+      :num_blocks: number of blocks in GPU computation
+      :num_points: number of points interested (aka q+p)
+    \return
+      int: number of random numbers generated in GPU computation
+  \endrst*/
+  static int GetVectorSize(int num_mc_itr, int num_threads, int num_blocks, int num_points) noexcept OL_WARN_UNUSED_RESULT;
+
+  int GetProblemSize() const noexcept OL_PURE_FUNCTION OL_WARN_UNUSED_RESULT {
+    return dim*num_to_sample;
+  }
+
+  /*!\rst
+    Get the ``points_to_sample``: potential future samples whose EI (and/or gradients) are being evaluated
+
+    \output
+      :points_to_sample[dim][num_to_sample]: potential future samples whose EI (and/or gradients) are being evaluated
+  \endrst*/
+  void GetCurrentPoint(double * restrict points_to_sample) const noexcept OL_NONNULL_POINTERS {
+    std::copy(union_of_points.data(), union_of_points.data() + num_to_sample*dim, points_to_sample);
+  }
+
+  /*!\rst
+    Change the potential samples whose EI (and/or gradient) are being evaluated.
+    Update the state's derived quantities to be consistent with the new points.
+
+    \param
+      :ei_evaluator: expected improvement evaluator object that specifies the parameters & GP for EI evaluation
+      :points_to_sample[dim][num_to_sample]: potential future samples whose EI (and/or gradients) are being evaluated
+  \endrst*/
+  void UpdateCurrentPoint(const EvaluatorType& ei_evaluator, double const * restrict points_to_sample) OL_NONNULL_POINTERS;
+
+  /*!\rst
+    Configures this state object with new ``points_to_sample``, the location of the potential samples whose EI is to be evaluated.
+    Ensures all state variables & temporaries are properly sized.
+    Properly sets all dependent state variables (e.g., GaussianProcess's state) for EI evaluation.
+
+    .. WARNING::
+         This object's state is INVALIDATED if the ``ei_evaluator`` (including the GaussianProcess it depends on) used in
+         SetupState is mutated! SetupState() should be called again in such a situation.
+
+    \param
+      :ei_evaluator: expected improvement evaluator object that specifies the parameters & GP for EI evaluation
+      :points_to_sample[dim][num_to_sample]: potential future samples whose EI (and/or gradients) are being evaluated
+  \endrst*/
+  void SetupState(const EvaluatorType& ei_evaluator, double const * restrict points_to_sample) OL_NONNULL_POINTERS;
+
+  // size information
+  //! spatial dimension (e.g., entries per point of ``points_sampled``)
+  const int dim;
+  //! number of potential future samples; gradients are evaluated wrt these points (i.e., the "q" in q,p-EI)
+  const int num_to_sample;
+  //! number of points being sampled concurrently (i.e., the "p" in q,p-EI)
+  const int num_being_sampled;
+  //! number of derivative terms desired (usually 0 for no derivatives or num_to_sample)
+  const int num_derivatives;
+  //! number of points in union_of_points: num_to_sample + num_being_sampled
+  const int num_union;
+
+  //! points currently being sampled; this is the union of the points represented by "q" and "p" in q,p-EI
+  //! ``points_to_sample`` is stored first in memory, immediately followed by ``points_being_sampled``
+  std::vector<double> union_of_points;
+
+  //! gaussian process state
+  GaussianProcess::StateType points_to_sample_state;
+
+  //! random number generator
+  UniformRandomGenerator* uniform_rng;
+
+  // temporary storage: preallocated space used by CudaExpectedImprovementEvaluator's member functions
+  //! the mean of the GP evaluated at union_of_points
+  std::vector<double> to_sample_mean;
+  //! the gradient of the GP mean evaluated at union_of_points, wrt union_of_points[0:num_to_sample]
+  std::vector<double> grad_mu;
+  //! the cholesky (``LL^T``) factorization of the GP variance evaluated at union_of_points
+  std::vector<double> cholesky_to_sample_var;
+  //! the gradient of the cholesky (``LL^T``) factorization of the GP variance evaluated at union_of_points wrt union_of_points[0:num_to_sample]
+  std::vector<double> grad_chol_decomp;
+
+  bool configure_for_test;
+  //! structs containing pointers to store the memory locations of variables on GPU
+  //! input data for GPU computations and GPU should not modify them
+  CudaDevicePointer gpu_mu;
+  CudaDevicePointer gpu_chol_var;
+  CudaDevicePointer gpu_grad_mu;
+  CudaDevicePointer gpu_grad_chol_var;
+  //! data containing results returned by GPU computations
+  CudaDevicePointer gpu_ei_storage;
+  CudaDevicePointer gpu_grad_ei_storage;
+  //! data containing random numbers used in GPU computations, which are only
+  //! used for testing
+  CudaDevicePointer gpu_random_number_ei;
+  CudaDevicePointer gpu_random_number_grad_ei;
+
+  //! storage for random numbers used in computing EI & grad_ei, this is only used to setup unit test
+  std::vector<double> random_number_ei;
+  std::vector<double> random_number_grad_ei;
+
+  OL_DISALLOW_DEFAULT_AND_COPY_AND_ASSIGN(CudaExpectedImprovementState);
+};
+
+#endif  // OL_GPU_ENABLED
+
+}   // end namespace optimal_learning
+
+#endif  // MOE_OPTIMAL_LEARNING_CPP_GPP_EXPECTED_IMPROVEMENT_GPU_HPP_
diff --git a/moe/optimal_learning/cpp/gpp_expected_improvement_gpu_test.cpp b/moe/optimal_learning/cpp/gpp_expected_improvement_gpu_test.cpp
new file mode 100644
index 00000000..8deaba72
--- /dev/null
+++ b/moe/optimal_learning/cpp/gpp_expected_improvement_gpu_test.cpp
@@ -0,0 +1,262 @@
+/*!
+  \file gpp_expected_improvement_gpu_test.cpp
+  \rst
+  Routines to test the functions in gpp_expected_improvement_gpu.cpp.
+
+  The tests verify ExpectedImprovementGPUEvaluator from gpp_expected_improvement_gpu.cpp.
+
+  1. Monte-Carlo EI vs analytic EI validation: the monte-carlo versions are run to "high" accuracy and checked against
+     analytic formulae when applicable
+
+  2. GPU EI vs CPU EI: both use monte-carlo version and run consistency check on various random sample points
+\endrst*/
+
+#include "gpp_expected_improvement_gpu_test.hpp"
+
+#include <vector>
+#include <algorithm>
+
+#include <boost/random/uniform_real.hpp>  // NOLINT(build/include_order)
+
+#include "gpp_common.hpp"
+#include "gpp_exception.hpp"
+#include "gpp_expected_improvement_gpu.hpp"
+#include "gpp_math.hpp"
+#include "gpp_random.hpp"
+#include "gpp_test_utils.hpp"
+
+namespace optimal_learning {
+
+#ifdef OL_GPU_ENABLED
+
+namespace {
+/*!\rst
+  Test that the EI + grad EI computation (using MC integration) is consistent
+  with the special analytic case of EI when there is only *ONE* potential point
+  to sample.
+
+  Generates a set of 40 random test cases for expected improvement with only one potential sample.
+  The general EI (which uses MC integration) is evaluated to reasonably high accuracy (while not taking too long to run)
+  and compared against the analytic formula version for consistency.  The gradients (spatial) of EI are also checked.
+
+  \return
+    number of cases where analytic and monte-carlo EI do not match
+\endrst*/
+int RunCudaEIConsistencyTests() {
+  int total_errors = 0;
+
+  int which_gpu = 1;
+  const int num_mc_iter = 20000000;
+  const int dim = 3;
+  const int num_being_sampled = 0;
+  const int num_to_sample = 1;
+  const int num_sampled = 7;
+
+  double alpha = 2.80723;
+  // set best_so_far to be larger than max(points_sampled_value) (but don't make it huge or stability will be suffer)
+  double best_so_far = 10.0;
+  bool configure_for_gradients = true;
+
+  UniformRandomGenerator uniform_generator(31278);
+  boost::uniform_real<double> uniform_double(0.5, 2.5);
+
+  MockExpectedImprovementEnvironment ei_environment;
+
+  std::vector<double> lengths(dim);
+  std::vector<double> noise_variance(num_sampled, 0.0);
+  std::vector<double> grad_ei_cuda(dim);
+  std::vector<double> grad_ei_one_potential_sample(dim);
+  double ei_cuda;
+  double ei_one_potential_sample;
+
+  for (int i = 0; i < 40; ++i) {
+    ei_environment.Initialize(dim, num_to_sample, num_being_sampled, num_sampled, &uniform_generator);
+    for (int j = 0; j < dim; ++j) {
+      lengths[j] = uniform_double(uniform_generator.engine);
+    }
+    SquareExponential sqexp_covariance(dim, alpha, lengths);
+    GaussianProcess gaussian_process(sqexp_covariance, ei_environment.points_sampled(), ei_environment.points_sampled_value(), noise_variance.data(), dim, num_sampled);
+
+    OnePotentialSampleExpectedImprovementEvaluator one_potential_sample_ei_evaluator(gaussian_process, best_so_far);
+    OnePotentialSampleExpectedImprovementEvaluator::StateType one_potential_sample_ei_state(one_potential_sample_ei_evaluator, ei_environment.points_to_sample(), configure_for_gradients);
+
+    CudaExpectedImprovementEvaluator cuda_ei_evaluator(gaussian_process, num_mc_iter, best_so_far, which_gpu);
+    CudaExpectedImprovementEvaluator::StateType cuda_ei_state(cuda_ei_evaluator, ei_environment.points_to_sample(), ei_environment.points_being_sampled(), num_to_sample, num_being_sampled, configure_for_gradients, &uniform_generator);
+
+    ei_cuda = cuda_ei_evaluator.ComputeObjectiveFunction(&cuda_ei_state);
+    cuda_ei_evaluator.ComputeGradObjectiveFunction(&cuda_ei_state, grad_ei_cuda.data());
+    ei_one_potential_sample = one_potential_sample_ei_evaluator.ComputeObjectiveFunction(&one_potential_sample_ei_state);
+    one_potential_sample_ei_evaluator.ComputeGradObjectiveFunction(&one_potential_sample_ei_state, grad_ei_one_potential_sample.data());
+
+    int ei_errors_this_iteration = 0;
+    if (!CheckDoubleWithinRelative(ei_cuda, ei_one_potential_sample, 5.0e-3)) {
+      ++ei_errors_this_iteration;
+    }
+    if (ei_errors_this_iteration != 0) {
+      OL_PARTIAL_FAILURE_PRINTF("in EI on iteration %d\n", i);
+    }
+    total_errors += ei_errors_this_iteration;
+
+    int grad_ei_errors_this_iteration = 0;
+    for (int j = 0; j < dim; ++j) {
+      if (!CheckDoubleWithinRelative(grad_ei_cuda[j], grad_ei_one_potential_sample[j], 4.5e-3)) {
+        ++grad_ei_errors_this_iteration;
+      }
+    }
+
+    if (grad_ei_errors_this_iteration != 0) {
+      OL_PARTIAL_FAILURE_PRINTF("in EI gradients on iteration %d\n", i);
+    }
+    total_errors += grad_ei_errors_this_iteration;
+  }
+
+  if (total_errors != 0) {
+    OL_PARTIAL_FAILURE_PRINTF("comparing MC EI to analytic EI failed with %d total_errors\n\n", total_errors);
+  } else {
+    OL_PARTIAL_SUCCESS_PRINTF("comparing MC EI to analytic EI passed\n");
+  }
+  return total_errors;
+}
+
+/*!\rst
+  Tests that the general EI + grad EI computation on CPU (using MC integration) is consistent
+  with the computation on GPU. We use exactly the same sequences of normal random numbers on
+  CPU and GPU so that they are supposed to output the same result even if the number of MC
+  iterations is small.
+
+  Generates a set of 10 random test cases for genral q,p-EI computed on cpu vs gpu.
+  The computations on cpu and gpu use the same set of normal random numbers for MC
+  simulation, so that we can make sure the outputs should be consistent, even with
+  a relative small number of MC iteration.
+
+  \return
+    number of cases where outputs from cpu and gpu do not match.
+\endrst*/
+int RunCudaEIvsCpuEITests() {
+  int total_errors = 0;
+
+  int which_gpu = 1;
+  const int num_mc_iter = 40000;
+  const int dim = 3;
+  const int num_being_sampled = 4;
+  const int num_to_sample = 4;
+  const int num_sampled = 20;
+
+  double alpha = 2.80723;
+  // set best_so_far to be larger than max(points_sampled_value) (but don't make it huge or stability will be suffer)
+  double best_so_far = 10.0;
+  bool configure_for_gradients = true;
+  bool configure_for_test = true;
+
+  UniformRandomGenerator uniform_generator(31278);
+  boost::uniform_real<double> uniform_double(0.5, 2.5);
+
+  MockExpectedImprovementEnvironment ei_environment;
+
+  std::vector<double> lengths(dim);
+  std::vector<double> noise_variance(num_sampled, 0.0);
+  std::vector<double> grad_ei_cpu(dim*num_to_sample);
+  std::vector<double> grad_ei_gpu(dim*num_to_sample);
+  std::vector<double> normal_random_table;
+  double ei_cpu;
+  double ei_gpu;
+  int cpu_num_iter;
+
+  for (int i = 0; i < 10; ++i) {
+    ei_environment.Initialize(dim, num_to_sample, num_being_sampled, num_sampled, &uniform_generator);
+    for (int j = 0; j < dim; ++j) {
+      lengths[j] = uniform_double(uniform_generator.engine);
+    }
+    SquareExponential sqexp_covariance(dim, alpha, lengths);
+    GaussianProcess gaussian_process(sqexp_covariance, ei_environment.points_sampled(), ei_environment.points_sampled_value(), noise_variance.data(), dim, num_sampled);
+
+    CudaExpectedImprovementEvaluator cuda_ei_evaluator(gaussian_process, num_mc_iter, best_so_far, which_gpu);
+    CudaExpectedImprovementEvaluator::StateType cuda_ei_state(cuda_ei_evaluator, ei_environment.points_to_sample(), ei_environment.points_being_sampled(), num_to_sample, num_being_sampled, configure_for_gradients, &uniform_generator, configure_for_test);
+
+    ei_gpu = cuda_ei_evaluator.ComputeObjectiveFunction(&cuda_ei_state);
+
+    // setup cpu EI computation
+    normal_random_table = cuda_ei_state.random_number_ei;
+    cpu_num_iter = normal_random_table.size()/ (num_being_sampled + num_to_sample);
+    NormalRNGSimulator normal_rng_for_ei(normal_random_table);
+    ExpectedImprovementEvaluator ei_evaluator_for_ei(gaussian_process, cpu_num_iter, best_so_far);
+    ExpectedImprovementEvaluator::StateType ei_state_for_ei(ei_evaluator_for_ei, ei_environment.points_to_sample(), ei_environment.points_being_sampled(), num_to_sample, num_being_sampled, false, &normal_rng_for_ei);
+    ei_cpu = ei_evaluator_for_ei.ComputeObjectiveFunction(&ei_state_for_ei);
+
+    // setup cpu gradEI computation
+    cuda_ei_evaluator.ComputeGradObjectiveFunction(&cuda_ei_state, grad_ei_gpu.data());
+
+    normal_random_table = cuda_ei_state.random_number_grad_ei;
+    cpu_num_iter = normal_random_table.size()/ (num_being_sampled + num_to_sample);
+    NormalRNGSimulator normal_rng_for_grad_ei(normal_random_table);
+    ExpectedImprovementEvaluator ei_evaluator_for_grad_ei(gaussian_process, cpu_num_iter, best_so_far);
+    ExpectedImprovementEvaluator::StateType ei_state_for_grad_ei(ei_evaluator_for_grad_ei, ei_environment.points_to_sample(), ei_environment.points_being_sampled(), num_to_sample, num_being_sampled, true, &normal_rng_for_grad_ei);
+    ei_evaluator_for_grad_ei.ComputeGradObjectiveFunction(&ei_state_for_grad_ei, grad_ei_cpu.data());
+
+    int ei_errors_this_iteration = 0;
+    if (!CheckDoubleWithinRelative(ei_cpu, ei_gpu, 1.0e-12)) {
+      ++ei_errors_this_iteration;
+    }
+    if (ei_errors_this_iteration != 0) {
+      OL_PARTIAL_FAILURE_PRINTF("in EI on iteration %d\n", i);
+    }
+    total_errors += ei_errors_this_iteration;
+
+    int grad_ei_errors_this_iteration = 0;
+    for (int j = 0; j < dim*num_to_sample; ++j) {
+      if (!CheckDoubleWithinRelative(grad_ei_cpu[j], grad_ei_gpu[j], 1.0e-12)) {
+        ++grad_ei_errors_this_iteration;
+      }
+    }
+
+    if (grad_ei_errors_this_iteration != 0) {
+      OL_PARTIAL_FAILURE_PRINTF("in EI gradients on iteration %d\n", i);
+    }
+    total_errors += grad_ei_errors_this_iteration;
+  }
+
+  if (total_errors != 0) {
+    OL_PARTIAL_FAILURE_PRINTF("comparing cpu EI to gpu EI failed with %d total_errors\n\n", total_errors);
+  } else {
+    OL_PARTIAL_SUCCESS_PRINTF("comparing cpu EI to gpu EI passed\n");
+  }
+  return total_errors;
+}
+
+}  // end unnamed namespace
+
+/*!\rst
+  Invoke all tests for GPU functions.
+  \return
+    number of test failures: 0 if all is working well.
+\endrst*/
+int RunGPUTests() {
+  int total_errors = 0;
+  int error = RunCudaEIConsistencyTests();
+  if (error != 0) {
+    OL_FAILURE_PRINTF("analytic, Cuda EI do not match for 1 potential sample case\n");
+  } else {
+    OL_SUCCESS_PRINTF("analytic, Cuda EI match for 1 potential sample case\n");
+  }
+  total_errors += error;
+
+  error = RunCudaEIvsCpuEITests();
+  if (error != 0) {
+    OL_FAILURE_PRINTF("cudaEI vs cpuEI consistency check failed\n");
+  } else {
+    OL_SUCCESS_PRINTF("cudaEI vs cpuEI consistency check successed\n");
+  }
+  total_errors += error;
+  return total_errors;
+}
+
+#else  // OL_GPU_ENABLED
+
+int RunGPUTests() {
+  OL_WARNING_PRINTF("no gpu component is enabled, this test did not run.\n");
+  return 0;
+}
+
+#endif  // OL_GPU_ENABLED
+
+}  // end namespace optimal_learning
diff --git a/moe/optimal_learning/cpp/gpp_expected_improvement_gpu_test.hpp b/moe/optimal_learning/cpp/gpp_expected_improvement_gpu_test.hpp
new file mode 100644
index 00000000..92b51bfa
--- /dev/null
+++ b/moe/optimal_learning/cpp/gpp_expected_improvement_gpu_test.hpp
@@ -0,0 +1,29 @@
+/*!
+  \file gpp_expected_improvement_gpu_test.hpp
+  \rst
+  Functions for testing expected improvement functions on GPU.
+
+  Tests are broken into two main groups:
+
+  * consistency test against analytical 1,0-EI result
+  * compare with CPU(MC) results
+  .. Note:: These tests do not run if GPU computation (``OL_GPU_ENABLED``) is disabled.
+
+\endrst*/
+
+#ifndef MOE_OPTIMAL_LEARNING_CPP_GPP_EXPECTED_IMPROVEMENT_GPU_TEST_HPP_
+#define MOE_OPTIMAL_LEARNING_CPP_GPP_EXPECTED_IMPROVEMENT_GPU_TEST_HPP_
+
+#include "gpp_common.hpp"
+
+namespace optimal_learning {
+/*!\rst
+  Invoke all tests for GPU functions.
+  \return
+    number of test failures: 0 if all is working well.
+\endrst*/
+OL_WARN_UNUSED_RESULT int RunGPUTests();
+
+}  // end namespace optimal_learning
+#endif  // MOE_OPTIMAL_LEARNING_CPP_GPP_EXPECTED_IMPROVEMENT_GPU_TEST_HPP_
+
diff --git a/moe/optimal_learning/cpp/gpp_python_test.cpp b/moe/optimal_learning/cpp/gpp_python_test.cpp
index d72c7f37..e9725f30 100644
--- a/moe/optimal_learning/cpp/gpp_python_test.cpp
+++ b/moe/optimal_learning/cpp/gpp_python_test.cpp
@@ -17,6 +17,7 @@
 #include "gpp_covariance_test.hpp"
 #include "gpp_domain.hpp"
 #include "gpp_domain_test.hpp"
+#include "gpp_expected_improvement_gpu_test.hpp"
 #include "gpp_geometry_test.hpp"
 #include "gpp_heuristic_expected_improvement_optimization_test.hpp"
 #include "gpp_linear_algebra_test.hpp"
@@ -75,6 +76,14 @@ int RunCppTestsWrapper() {
   }
   total_errors += error;
 
+  error = RunGPUTests();
+  if (error != 0) {
+    OL_FAILURE_PRINTF("GPU tests failed\n");
+  } else {
+    OL_SUCCESS_PRINTF("GPU tests passed\n");
+  }
+  total_errors += error;
+
   error = RunLogLikelihoodPingTests();
   if (error != 0) {
     OL_FAILURE_PRINTF("LogLikelihood ping tests failed\n");
diff --git a/moe/optimal_learning/cpp/gpp_random.cpp b/moe/optimal_learning/cpp/gpp_random.cpp
index a3d9f827..0af9d294 100644
--- a/moe/optimal_learning/cpp/gpp_random.cpp
+++ b/moe/optimal_learning/cpp/gpp_random.cpp
@@ -147,7 +147,7 @@ double NormalRNGSimulator::operator()() {
   int size_of_table = random_number_table_.size();
   if (index_ < size_of_table) {
     ++index_;
-    return random_number_table_[index_];
+    return random_number_table_[index_-1];
   } else {
     OL_THROW_EXCEPTION(InvalidValueException<int>, "All random numbers stored in the RNG have been used up!", index_, size_of_table);
   }
diff --git a/moe/optimal_learning/cpp/gpp_test_utils.cpp b/moe/optimal_learning/cpp/gpp_test_utils.cpp
index 35df2219..16dddf44 100644
--- a/moe/optimal_learning/cpp/gpp_test_utils.cpp
+++ b/moe/optimal_learning/cpp/gpp_test_utils.cpp
@@ -177,9 +177,9 @@ bool CheckDoubleWithin(double value, double truth, double tolerance) noexcept {
   return passed;
 }
 
-bool CheckDoubleWithinRelative(double value, double truth, double tolerance) noexcept {
+bool CheckDoubleWithinRelativeWithThreshold(double value, double truth, double tolerance, double threshold) noexcept {
   double denom = std::fabs(truth);
-  if (denom < std::numeric_limits<double>::min()) {
+  if (denom < threshold) {
     denom = 1.0;  // don't divide by 0
   }
   double diff = std::fabs((value - truth)/denom);
@@ -187,10 +187,13 @@ bool CheckDoubleWithinRelative(double value, double truth, double tolerance) noe
   if (passed != true) {
     OL_ERROR_PRINTF("value = %.18E, truth = %.18E, diff = %.18E, tol = %.18E\n", value, truth, diff, tolerance);
   }
-
   return passed;
 }
 
+bool CheckDoubleWithinRelative(double value, double truth, double tolerance) noexcept {
+  return CheckDoubleWithinRelativeWithThreshold(value, truth, tolerance, std::numeric_limits<double>::min());
+}
+
 /*!\rst
   Uses the Frobenius Norm for convenience; matrix 2-norms are expensive to compute.
 \endrst*/
diff --git a/moe/optimal_learning/cpp/gpp_test_utils.hpp b/moe/optimal_learning/cpp/gpp_test_utils.hpp
index 6edc6524..be0258c7 100644
--- a/moe/optimal_learning/cpp/gpp_test_utils.hpp
+++ b/moe/optimal_learning/cpp/gpp_test_utils.hpp
@@ -428,6 +428,21 @@ bool CheckDoubleWithin(double value, double truth, double tolerance) noexcept OL
 \endrst*/
 bool CheckDoubleWithinRelative(double value, double truth, double tolerance) noexcept OL_PURE_FUNCTION OL_WARN_UNUSED_RESULT;
 
+/*!\rst
+  Checks if ``|value - truth| / |truth| <= tolerance`` (relative error)
+
+  If truth < threshold, CheckDoubleWithin() is performed.
+
+  \param
+    :value: number to be tested
+    :truth: the exact/desired result
+    :tolerance: permissible relative difference
+    :threshold: tolerance = |value - truth| if |truth| < threshold, this is to control unexpected large or undefined relative diff when truth is "too small" (0 for example)
+  \return
+    true if value, truth differ relatively by no more than tolerance.
+\endrst*/
+bool CheckDoubleWithinRelativeWithThreshold(double value, double truth, double tolerance, double threshold) noexcept OL_PURE_FUNCTION OL_WARN_UNUSED_RESULT;
+
 /*!\rst
   Checks that ``||A - B||_F <= tolerance``
 
diff --git a/moe/optimal_learning/cpp/gpu/CMakeLists.txt b/moe/optimal_learning/cpp/gpu/CMakeLists.txt
new file mode 100644
index 00000000..d2c9888f
--- /dev/null
+++ b/moe/optimal_learning/cpp/gpu/CMakeLists.txt
@@ -0,0 +1,20 @@
+cmake_minimum_required(VERSION 2.8.9)
+find_package(CUDA 5.0 REQUIRED)
+CUDA_INCLUDE_DIRECTORIES(
+    ${CUDA_INCLUDE_DIRS}
+    ${MOE_CUDA_SDK_INCLUDE_DIRS}
+    )
+
+# Provide a list of actual .cu files
+set(CUDA_SRCS
+    gpp_cuda_math.cu
+    )
+
+# Compiler flags - eg specify different compute capabilites 
+list(APPEND CUDA_NVCC_FLAGS --gpu-architecture sm_20)
+
+CUDA_ADD_LIBRARY(
+    OL_GPU
+    ${CUDA_SRCS}
+    SHARED
+    )
diff --git a/moe/optimal_learning/cpp/gpu/gpp_cuda_math.cu b/moe/optimal_learning/cpp/gpu/gpp_cuda_math.cu
new file mode 100644
index 00000000..bffb225b
--- /dev/null
+++ b/moe/optimal_learning/cpp/gpu/gpp_cuda_math.cu
@@ -0,0 +1,394 @@
+/*!
+  \file gpp_cuda_math.cu
+  \rst
+  This file contains implementations of all GPU functions. There are both device code (executed on
+  GPU device) and host code (executed on CPU), and they are compiled by NVCC, which is a NVIDIA CUDA
+  compiler.
+\endrst*/
+
+#include "gpp_cuda_math.hpp"
+
+#include <stdint.h>
+#include <curand_kernel.h>
+#include <cuda_runtime.h>
+#include <helper_cuda.h>
+#include <helper_functions.h>
+
+#include <algorithm>
+
+/*!\rst
+  Macro to stringify the expansion of a macro. For example, say we are on line 53:
+
+  * ``#__LINE__ --> "__LINE__"``
+  * ``OL_CUDA_STRINGIFY_EXPANSION(__LINE__) --> "53"``
+
+  ``OL_CUDA_STRINGIFY_EXPANSION_INNER`` is not meant to be used directly;
+  but we need ``#x`` in a macro for this expansion to work.
+
+  This is a standard trick; see bottom of:
+  http://gcc.gnu.org/onlinedocs/cpp/Stringification.html
+\endrst*/
+#define OL_CUDA_STRINGIFY_EXPANSION_INNER(x) #x
+#define OL_CUDA_STRINGIFY_EXPANSION(x) OL_CUDA_STRINGIFY_EXPANSION_INNER(x)
+
+/*!\rst
+  Macro to stringify and format the current file and line number. For
+  example, if the macro is invoked from line 893 of file gpp_foo.cpp,
+  this macro produces the compile-time string-constant:
+  ``(gpp_foo.cpp: 893)``
+\endrst*/
+#define OL_CUDA_STRINGIFY_FILE_AND_LINE "(" __FILE__ ": " OL_CUDA_STRINGIFY_EXPANSION(__LINE__) ")"
+
+/*!\rst
+  Macro that checks error message (with type cudaError_t) returned by CUDA API functions, and if there is error occurred,
+  the macro produces a C struct containing error message, function name where error occured, file name and line info, and 
+  then terminate the function.
+\endrst*/
+#define OL_CUDA_ERROR_RETURN(X) do {cudaError_t _error_code = (X); if (_error_code != cudaSuccess) {CudaError _err = {_error_code, OL_CUDA_STRINGIFY_FILE_AND_LINE, __func__}; return _err;}} while (0)
+
+namespace optimal_learning {
+
+namespace {  // functions run on gpu device
+/*!\rst
+  Special case of GeneralMatrixVectorMultiply.  As long as A has zeros in the strict upper-triangle,
+  GeneralMatrixVectorMultiply will work too (but take ``>= 2x`` as long).
+
+  Computes results IN-PLACE.
+  Avoids accessing the strict upper triangle of A.
+
+  Should be equivalent to BLAS call:
+  ``dtrmv('L', trans, 'N', size_m, A, size_m, x, 1);``
+\endrst*/
+__device__ void CudaTriangularMatrixVectorMultiply(double const * __restrict__ A, int size_m, double * __restrict__ x) {
+  double temp;
+  A += size_m * (size_m-1);
+  for (int j = size_m-1; j >= 0; --j) {  // i.e., j >= 0
+    temp = x[j];
+    for (int i = size_m-1; i >= j+1; --i) {
+      // handles sub-diagonal contributions from j-th column
+      x[i] += temp*A[i];
+    }
+    x[j] *= A[j];  // handles j-th on-diagonal component
+    A -= size_m;
+  }
+}
+
+/*!\rst
+  This is reduced version of GeneralMatrixVectorMultiply(...) in gpp_linear_algebra.cpp, and this function computes
+  y = y - A * x (aka alpha = -1.0, beta = 1.0)
+\endrst*/
+__device__ void CudaGeneralMatrixVectorMultiply(double const * __restrict__ A, double const * __restrict__ x, int size_m, int size_n, int lda, double * __restrict__ y) {
+  double temp;
+  for (int i = 0; i < size_n; ++i) {
+    temp = -1.0 * x[i];
+    for (int j = 0; j < size_m; ++j) {
+      y[j] += A[j]*temp;
+    }
+    A += lda;
+  }
+}
+
+/*!\rst
+  This inline function copies [begin, begin+1, ..., end-1] elements from one array to the other, if bound < end, then end = bound
+\endrst*/
+__forceinline__ __device__ void CudaCopyElements(int begin, int end, int bound, double const * __restrict__ origin, double * __restrict__ destination) {
+    int local_end = end < bound ? end : bound;
+    for (int idx = begin; idx < local_end; ++idx) {
+        destination[idx] = origin[idx];
+    }
+}
+
+/*!\rst
+  Device code to compute Expected Improvement by Monte-Carlo on GPU
+  \param
+    :mu[num_union]: the mean of the GP evaluated at points interested
+    :chol_var[num_union][num_union]: cholesky factorization of the GP variance evaluated at points interested
+    :num_union: number of the points interested
+    :num_iteration: number of iterations performed on each thread for MC evaluation
+    :best: best function evaluation obtained so far
+    :seed: seed for RNG
+    :ei_storage[num_threads][num_blocks]: array storing values of EI on GPU
+    :gpu_random_number_ei[num_union][num_iteration][num_threads][num_blocks]: array storing random
+     numbers used for computing EI, for testing purpose only
+    :configure_for_test: whether record random_number_ei or not
+  \output
+    :ei_storage[num_threads][num_blocks]: each thread write result of computed EI to its corresponding position
+    :gpu_random_number_ei[num_union][num_iteration][num_threads][num_blocks]: write random numbers
+     used for computing EI into the array, for testing purpose only
+\endrst*/
+__global__ void CudaComputeEIGpu(double const * __restrict__ mu, double const * __restrict__ chol_var,
+                                 int num_union, int num_iteration, double best, uint64_t seed,
+                                 double * __restrict__ ei_storage, double* __restrict__ gpu_random_number_ei,
+                                 bool configure_for_test) {
+  // copy mu, chol_var to shared memory mu_local & chol_var_local
+  // For multiple dynamically sized arrays in a single kernel, declare a single extern unsized array, and use
+  // pointers into it to divide it into multiple arrays
+  // refer to http://devblogs.nvidia.com/parallelforall/using-shared-memory-cuda-cc/
+  extern __shared__ double storage[];
+  double * chol_var_local = storage;
+  double * mu_local = chol_var_local + num_union * num_union;
+  const int idx = threadIdx.x;
+  const int IDX = threadIdx.x + blockDim.x * blockIdx.x;
+  int chunk_size = (num_union * num_union - 1)/ blockDim.x + 1;
+  CudaCopyElements(chunk_size * idx, chunk_size * (idx + 1), num_union * num_union, chol_var, chol_var_local);
+  chunk_size = (num_union - 1)/ blockDim.x + 1;
+  CudaCopyElements(chunk_size * idx, chunk_size * (idx + 1), num_union,  mu, mu_local);
+  __syncthreads();
+
+  // MC start
+  // RNG setup
+  uint64_t local_seed = seed + IDX;
+  curandState random_state;
+  // seed a random number generator
+  curand_init(local_seed, 0, 0, &random_state);
+
+  double *normals = reinterpret_cast<double *>(malloc(sizeof(*chol_var_local) * num_union));
+  double agg = 0.0;
+  double improvement_this_step;
+  double EI;
+
+  for (int mc = 0; mc < num_iteration; ++mc) {
+    improvement_this_step = 0.0;
+    for (int i = 0; i < num_union; ++i) {
+        normals[i] = curand_normal_double(&random_state);
+        // If configure_for_test is true, random numbers used in MC computations will be saved as output.
+        // In fact we will let EI compuation on CPU use the same sequence of random numbers saved here,
+        // so that EI compuation on CPU & GPU can be compared directly for unit test purpose.
+        if (configure_for_test) {
+            gpu_random_number_ei[IDX * num_iteration * num_union + mc * num_union + i] = normals[i];
+        }
+    }
+    CudaTriangularMatrixVectorMultiply(chol_var_local, num_union, normals);
+    for (int i = 0; i < num_union; ++i) {
+        EI = best - (mu_local[i] + normals[i]);
+        improvement_this_step = fmax(EI, improvement_this_step);
+    }
+    agg += improvement_this_step;
+  }
+  ei_storage[IDX] = agg / static_cast<double>(num_iteration);
+  free(normals);
+}
+
+/*!\rst
+  Device code to compute Gradient of Expected Improvement by Monte-Carlo on GPU
+  \param
+    :mu[num_union]: the mean of the GP evaluated at points interested
+    :chol_var[num_union][num_union]: cholesky factorization of the GP variance evaluated at points interested
+    :grad_mu[dim][num_to_sample]: the gradient of mean of the GP evaluated at points interested
+    :grad_chol_var[dim][num_union][num_union][num_to_sample]: gradient of cholesky factorization of the GP variance
+     evaluated at points interested
+    :num_union: number of the union of points (aka q+p)
+    :num_to_sample: number of points to sample (aka q)
+    :dim: dimension of point space
+    :num_iteration: number of iterations performed on each thread for MC evaluation
+    :best: best function evaluation obtained so far
+    :seed: seed for RNG
+    :grad_ei_storage[dim][num_to_sample][num_threads][num_blocks]: A vector storing result of grad_ei from each thread
+    :gpu_random_number_grad_ei[num_union][num_itreration][num_threads][num_blocks]: array storing
+     random numbers used for computing gradEI, for testing purpose only
+    :configure_for_test: whether record random_number_grad_ei or not
+  \output
+    :grad_ei_storage[dim][num_to_sample][num_threads][num_blocks]: each thread write result of grad_ei
+     to its corresponding positions
+    :gpu_random_number_grad_ei[num_union][num_iteration][num_threads][num_blocks]: write random numbers
+     used for computing gradEI to the array, for testing purpose only
+\endrst*/
+__global__ void CudaComputeGradEIGpu(double const * __restrict__ mu, double const * __restrict__ chol_var,
+                                     double const * __restrict__ grad_mu, double const * __restrict__ grad_chol_var,
+                                     int num_union, int num_to_sample, int dim, int num_iteration, double best,
+                                     uint64_t seed,  double * __restrict__ grad_ei_storage,
+                                     double* __restrict__ gpu_random_number_grad_ei, bool configure_for_test) {
+  // copy mu, chol_var, grad_mu, grad_chol_var to shared memory
+  extern __shared__ double storage[];
+  double * mu_local = storage;
+  double * chol_var_local = mu_local + num_union;
+  double * grad_mu_local = chol_var_local + num_union * num_union;
+  double * grad_chol_var_local = grad_mu_local + num_to_sample * dim;
+  const int idx = threadIdx.x;
+  const int IDX = threadIdx.x + blockDim.x * blockIdx.x;
+  int chunk_size = (num_to_sample * num_union * num_union * dim - 1)/ blockDim.x + 1;
+  CudaCopyElements(chunk_size * idx, chunk_size * (idx + 1), num_to_sample * num_union * num_union * dim,
+                  grad_chol_var, grad_chol_var_local);
+  chunk_size = (num_union * num_union - 1)/ blockDim.x + 1;
+  CudaCopyElements(chunk_size * idx, chunk_size * (idx + 1), num_union * num_union, chol_var, chol_var_local);
+  chunk_size = (num_to_sample * dim - 1)/ blockDim.x + 1;
+  CudaCopyElements(chunk_size * idx, chunk_size * (idx + 1), num_to_sample * dim, grad_mu, grad_mu_local);
+  chunk_size = (num_union - 1)/ blockDim.x + 1;
+  CudaCopyElements(chunk_size * idx, chunk_size * (idx + 1), num_union, mu, mu_local);
+  __syncthreads();
+
+  int i, k, mc, winner;
+  double EI, improvement_this_step;
+  // RNG setup
+  uint64_t local_seed = seed + IDX;
+  curandState random_state;
+  curand_init(local_seed, 0, 0, &random_state);
+  double* normals = reinterpret_cast<double*>(malloc(sizeof(*mu_local) * num_union));
+  double* normals_copy = reinterpret_cast<double*>(malloc(sizeof(*mu_local) * num_union));
+  // initialize grad_ei_storage
+  for (int i = 0; i < (num_to_sample * dim); ++i) {
+      grad_ei_storage[IDX*num_to_sample*dim + i] = 0.0;
+  }
+  // MC step start
+  for (mc = 0; mc < num_iteration; ++mc) {
+      improvement_this_step = 0.0;
+      winner = -1;
+      for (i = 0; i < num_union; ++i) {
+          normals[i] = curand_normal_double(&random_state);
+          normals_copy[i] = normals[i];
+            // If configure_for_test is true, random numbers used in MC computations will be saved as output.
+            // In fact we will let grad_ei compuation on CPU use the same sequence of random numbers saved here,
+            // so that grad_ei compuation on CPU & GPU can be compared directly for unit test purpose.
+          if (configure_for_test) {
+              gpu_random_number_grad_ei[IDX * num_iteration * num_union + mc * num_union + i] = normals[i];
+          }
+      }
+      CudaTriangularMatrixVectorMultiply(chol_var_local, num_union, normals);
+      for (i = 0; i < num_union; ++i) {
+          EI = best - (mu_local[i] + normals[i]);
+          if (EI > improvement_this_step) {
+              improvement_this_step = EI;
+              winner = i;
+          }
+      }
+      if (improvement_this_step > 0.0) {
+          if (winner < num_to_sample) {
+              for (k = 0; k < dim; ++k) {
+                  grad_ei_storage[IDX*num_to_sample*dim + winner * dim + k] -= grad_mu_local[winner * dim + k];
+              }
+          }
+          for (i = 0; i < num_to_sample; ++i) {   // derivative w.r.t ith point
+              CudaGeneralMatrixVectorMultiply(grad_chol_var_local + i * num_union * num_union * dim +
+                                              winner * num_union * dim, normals_copy, dim, num_union,
+                                              dim, grad_ei_storage + IDX * num_to_sample * dim + i * dim);
+          }
+      }
+  }
+
+  for (int i = 0; i < num_to_sample*dim; ++i) {
+      grad_ei_storage[IDX*num_to_sample*dim + i] /= static_cast<double>(num_iteration);
+  }
+  free(normals);
+  free(normals_copy);
+}
+
+}  // end unnamed namespace
+
+CudaError CudaAllocateMemForDoubleVector(int num_doubles, double** __restrict__ address_of_ptr_to_gpu_memory) {
+  CudaError _success = {cudaSuccess, OL_CUDA_STRINGIFY_FILE_AND_LINE, __func__};
+  int mem_size = num_doubles * sizeof(**address_of_ptr_to_gpu_memory);
+  OL_CUDA_ERROR_RETURN(cudaMalloc(reinterpret_cast<void**>(address_of_ptr_to_gpu_memory), mem_size));
+  return _success;
+}
+
+void CudaFreeMem(double* __restrict__ ptr_to_gpu_memory) {
+  cudaFree(ptr_to_gpu_memory);
+}
+
+CudaError CudaGetEI(double * __restrict__ mu, double * __restrict__ chol_var, int num_union, int num_mc,
+                    uint64_t seed, double best, bool configure_for_test, double * __restrict__ random_number_ei,
+                    double * __restrict__ ei_val, double * __restrict__ gpu_mu, double * __restrict__ gpu_chol_var,
+                    double* __restrict__ gpu_random_number_ei, double * __restrict__ gpu_ei_storage) {
+  *ei_val = 0.0;
+  CudaError _success = {cudaSuccess, OL_CUDA_STRINGIFY_FILE_AND_LINE, __func__};
+
+  // We assign kEINumBlocks blocks and kEINumThreads threads/block for EI computation, so there are
+  // (kEINumBlocks * kEINumThreads) threads in total to execute kernel function in parallel
+  dim3 threads(kEINumThreads);
+  dim3 grid(kEINumBlocks);
+  double ei_storage[kEINumThreads * kEINumBlocks];
+  int num_iteration = num_mc / (kEINumThreads * kEINumBlocks) + 1;   // make sure num_iteration is always >= 1
+
+  int mem_size_mu = num_union * sizeof(*mu);
+  int mem_size_chol_var = num_union * num_union * sizeof(*mu);
+  int mem_size_ei_storage = kEINumThreads * kEINumBlocks * sizeof(*mu);
+  // copy mu, chol_var to GPU
+  OL_CUDA_ERROR_RETURN(cudaMemcpy(gpu_mu, mu, mem_size_mu, cudaMemcpyHostToDevice));
+  OL_CUDA_ERROR_RETURN(cudaMemcpy(gpu_chol_var, chol_var, mem_size_chol_var, cudaMemcpyHostToDevice));
+  // execute kernel
+  CudaComputeEIGpu <<< grid, threads, num_union*sizeof(*mu)+num_union*num_union*sizeof(*mu) >>>
+                   (gpu_mu, gpu_chol_var, num_union, num_iteration, best, seed, gpu_ei_storage,
+                    gpu_random_number_ei, configure_for_test);
+  OL_CUDA_ERROR_RETURN(cudaPeekAtLastError());
+  // copy gpu_ei_storage back to CPU
+  OL_CUDA_ERROR_RETURN(cudaMemcpy(ei_storage, gpu_ei_storage, mem_size_ei_storage, cudaMemcpyDeviceToHost));
+  // copy gpu_random_number_ei back to CPU if configure_for_test is on
+  if (configure_for_test) {
+      int mem_size_random_number_ei = num_iteration * kEINumThreads * kEINumBlocks * num_union * sizeof(*mu);
+      OL_CUDA_ERROR_RETURN(cudaMemcpy(random_number_ei, gpu_random_number_ei, mem_size_random_number_ei, cudaMemcpyDeviceToHost));
+  }
+  // average ei_storage
+  double ave = 0.0;
+  for (int i = 0; i < (kEINumThreads*kEINumBlocks); ++i) {
+      ave += ei_storage[i];
+  }
+  *ei_val = ave / static_cast<double>(kEINumThreads*kEINumBlocks);
+  return _success;
+}
+
+CudaError CudaGetGradEI(double * __restrict__ mu, double * __restrict__ chol_var, double * __restrict__ grad_mu,
+                        double * __restrict__ grad_chol_var, int num_union, int num_to_sample, int dim, int num_mc,
+                        uint64_t seed, double best, bool configure_for_test, double* __restrict__ random_number_grad_ei,
+                        double * __restrict__ grad_ei, double * __restrict__ gpu_mu, double * __restrict__ gpu_chol_var,
+                        double * __restrict__ gpu_grad_mu, double * __restrict__ gpu_grad_chol_var,
+                        double* __restrict__ gpu_random_number_grad_ei, double * __restrict__ gpu_grad_ei_storage) {
+  CudaError _success = {cudaSuccess, OL_CUDA_STRINGIFY_FILE_AND_LINE, __func__};
+
+  double grad_ei_storage[num_to_sample * dim * kGradEINumThreads * kGradEINumBlocks];
+  std::fill(grad_ei, grad_ei + num_to_sample * dim, 0.0);
+
+  // We assign kGradEINumBlocks blocks and kGradEINumThreads threads/block for grad_ei computation,
+  // so there are (kGradEINumBlocks * kGradEINumThreads) threads in total to execute kernel function
+  // in parallel
+  dim3 threads(kGradEINumThreads);
+  dim3 grid(kGradEINumBlocks);
+  int num_iteration = num_mc / (kGradEINumThreads * kGradEINumBlocks) + 1;   // make sure num_iteration is always >= 1
+
+  int mem_size_mu = num_union * sizeof(*mu);
+  int mem_size_grad_mu = num_to_sample * dim * sizeof(*mu);
+  int mem_size_chol_var = num_union * num_union *sizeof(*mu);
+  int mem_size_grad_chol_var = num_to_sample * num_union * num_union * dim * sizeof(*mu);
+  int mem_size_grad_ei_storage= kGradEINumThreads * kGradEINumBlocks * num_to_sample * dim * sizeof(*mu);
+
+  OL_CUDA_ERROR_RETURN(cudaMemcpy(gpu_mu, mu, mem_size_mu, cudaMemcpyHostToDevice));
+  OL_CUDA_ERROR_RETURN(cudaMemcpy(gpu_grad_mu, grad_mu, mem_size_grad_mu, cudaMemcpyHostToDevice));
+  OL_CUDA_ERROR_RETURN(cudaMemcpy(gpu_chol_var, chol_var, mem_size_chol_var, cudaMemcpyHostToDevice));
+  OL_CUDA_ERROR_RETURN(cudaMemcpy(gpu_grad_chol_var, grad_chol_var, mem_size_grad_chol_var, cudaMemcpyHostToDevice));
+
+  // execute kernel
+  // inputs: gpu_mu, gpu_chol_var, gpu_grad_mu, gpu_grad_chol_var, best, num_union, num_to_sample, dim, num_iteration, seed
+  // output: gpu_grad_ei_storage
+  CudaComputeGradEIGpu <<< grid, threads, mem_size_mu+mem_size_chol_var+mem_size_grad_mu+mem_size_grad_chol_var >>>
+                       (gpu_mu, gpu_chol_var, gpu_grad_mu, gpu_grad_chol_var, num_union, num_to_sample, dim,
+                        num_iteration, best, seed, gpu_grad_ei_storage, gpu_random_number_grad_ei, configure_for_test);
+  OL_CUDA_ERROR_RETURN(cudaPeekAtLastError());
+
+  OL_CUDA_ERROR_RETURN(cudaMemcpy(grad_ei_storage, gpu_grad_ei_storage, mem_size_grad_ei_storage, cudaMemcpyDeviceToHost));
+  // copy gpu_random_number_grad_ei back to CPU if configure_for_test is on
+  if (configure_for_test) {
+      int mem_size_random_number_grad_ei = num_iteration * kGradEINumThreads * kGradEINumBlocks * num_union * sizeof(*mu);
+      OL_CUDA_ERROR_RETURN(cudaMemcpy(random_number_grad_ei, gpu_random_number_grad_ei, mem_size_random_number_grad_ei, cudaMemcpyDeviceToHost));
+  }
+
+  // The code block below extracts grad_ei from grad_ei_storage, which is output from the function
+  // "CudaGetGradEI" run on gpu. The way to do that is for each component of grad_ei, we find all
+  // the threads calculating the corresponding component and average over the threads.
+  for (int n = 0; n < (kGradEINumThreads*kGradEINumBlocks); ++n) {
+      for (int i = 0; i < num_to_sample*dim; ++i) {
+          grad_ei[i] += grad_ei_storage[n*num_to_sample*dim + i];
+      }
+  }
+  for (int i = 0; i < num_to_sample*dim; ++i) {
+      grad_ei[i] /= static_cast<double>(kGradEINumThreads*kGradEINumBlocks);
+  }
+  return _success;
+}
+
+CudaError CudaSetDevice(int devID) {
+  CudaError _success = {cudaSuccess, OL_CUDA_STRINGIFY_FILE_AND_LINE, __func__};
+  OL_CUDA_ERROR_RETURN(cudaSetDevice(devID));
+  return _success;
+}
+
+}    // end namespace optimal_learning
+
diff --git a/moe/optimal_learning/cpp/gpu/gpp_cuda_math.hpp b/moe/optimal_learning/cpp/gpu/gpp_cuda_math.hpp
new file mode 100644
index 00000000..8561e584
--- /dev/null
+++ b/moe/optimal_learning/cpp/gpu/gpp_cuda_math.hpp
@@ -0,0 +1,139 @@
+/*!\rst
+  \file gpp_cuda_math.hpp
+  \rst
+  This file contains declaration of gpu functions (host code) that are called by C++ code. The functions include calculating ExpectedImprovement, gradient of ExpectedImprovement, and gpu utility functions (memory allocation, setup gpu device, etc)
+\endrst*/
+
+#ifndef MOE_OPTIMAL_LEARNING_CPP_GPU_GPP_CUDA_MATH_HPP_
+#define MOE_OPTIMAL_LEARNING_CPP_GPU_GPP_CUDA_MATH_HPP_
+
+#include <stdint.h>
+
+#include "driver_types.h"
+
+namespace optimal_learning {
+
+//! Number of blocks assigned for computing Expected Improvement on GPU
+static unsigned int kEINumBlocks = 32;
+//! Number of threads per block assigned for computing Expected Improvement on GPU
+static unsigned int kEINumThreads = 256;
+//! Number of blocks assigned for computing Gradient of Expected Improvement on GPU
+static unsigned int kGradEINumBlocks = 32;
+//! Number of threads per block assigned for computing Gradient of Expected Improvement on GPU
+static unsigned int kGradEINumThreads = 256;
+
+/*!\rst
+  This C struct contains error information that are used by exception handling in gpp_expected_improvement_gpu.hpp/cpp
+\endrst*/
+struct CudaError {
+  //! error returned by CUDA API functions(basiclly enum type)
+  cudaError_t err;
+  //! file and line info of the function which returned error
+  char const * file_and_line_info;
+  //! name of the function that returned error
+  char const * func_info;
+};
+
+/*!\rst
+  Compute Expected Improvement by Monte-Carlo using GPU, and this function is only meant to be used by 
+  CudaExpectedImprovementEvaluator::ComputeExpectedImprovement(...) in gpp_expected_improvement_gpu.hpp/cpp
+  \param
+    :mu[num_union]: the mean of the GP evaluated at points interested
+    :chol_var[num_union][num_union]: cholesky factorization of the GP variance evaluated at points interested
+    :num_union: number of the points interested
+    :num_mc: number of iterations for Monte-Carlo simulation
+    :seed: seed for RNG
+    :best: best function evaluation obtained so far
+    :configure_for_test: whether record random_number_ei or not
+    :random_number_ei[num_union][num_iteration][num_threads][num_blocks]: random numbers used for
+     computing EI, for testing purpose only
+    :ei_val[1]: pointer to value of Expected Improvement
+    :gpu_mu[num_union]: pointer to memory storing mu on GPU
+    :gpu_chol_var[num_union][num_union]: pointer to memory storing chol_var on GPU
+    :gpu_random_number_ei[num_union][num_iteration][num_threads][num_blocks]: pointer to memory storing
+     random numbers used for computing EI, for testing purpose only
+    :gpu_ei_storage[num_threads][num_blocks]: pointer to memory storing values of EI on GPU
+  \output
+    :ei_val[1]: value of Expected Improvement modified, and equals to computed value of EI
+    :gpu_random_number_ei[num_union][num_iteration][num_threads][num_blocks]: pointer to memory storing
+     random numbers used for computing EI, for testing purpose only
+    :random_number_ei[num_union][num_iteration][num_threads][num_blocks]: random numbers used for
+     computing EI, for testing purpose only
+  \return
+    CudaError state, which contains error information, file name, line and function name of the function that occurs error
+\endrst*/
+extern "C" CudaError CudaGetEI(double * __restrict__ mu, double * __restrict__ chol_var, int num_union,
+                               int num_mc, uint64_t seed, double best, bool configure_for_test,
+                               double* __restrict__ random_number_ei, double* __restrict__ ei_val,
+                               double * __restrict__ gpu_mu, double * __restrict__ gpu_chol_var,
+                               double* __restrict__ gpu_random_number_ei, double * __restrict__ gpu_ei_storage);
+
+/*!\rst
+  Compute Gradient of Expected Improvement by Monte-Carlo using GPU, and this function is only meant to be used by 
+  CudaExpectedImprovementEvaluator::ComputeGradExpectedImprovement(...) in gpp_expected_improvement_gpu.hpp/cpp
+  \param
+    :mu[num_union]: the mean of the GP evaluated at points interested
+    :chol_var[num_union][num_union]: cholesky factorization of the GP variance evaluated at points interested
+    :grad_mu[dim][num_to_sample]: the gradient of mean of the GP evaluated at points interested
+    :grad_chol_var[dim][num_union][num_union][num_to_sample]: gradient of cholesky factorization of
+     the GP variance evaluated at points interested
+    :num_union: number of the union of points (aka q+p)
+    :num_to_sample: number of points to sample (aka q)
+    :dim: dimension of point space
+    :num_mc: number of iterations for Monte-Carlo simulation
+    :seed: seed for RNG
+    :best: best function evaluation obtained so far
+    :configure_for_test: whether record random_number_grad_ei or not
+    :random_number_grad_ei[num_union][num_threads][num_blocks]: random numbers used for computing gradEI,
+     for testing purpose only
+    :grad_ei[dim][num_to_sample]: pointer to value of gradient of Expected Improvement
+    :gpu_mu[num_union]: pointer to memory storing mu on GPU
+    :gpu_chol_var[num_union][num_union]: pointer to memory storing chol_var on GPU
+    :gpu_grad_mu[dim][num_to_sample]: pointer to memory storing grad_mu on GPU
+    :gpu_grad_chol_var[dim][num_union][num_union][num_to_sample]: pointer to memory storing grad_chol_var on GPU
+    :gpu_random_number_grad_ei[num_union][num_threads][num_blocks]: pointer to memory storing random
+     numbers used for computing gradEI, for testing purpose only
+    :gpu_grad_ei_storage[dim][num_to_sample][num_threads][num_blocks]: pointer to memory storing values of gradient EI on GPU
+  \output
+    :random_number_grad_ei[num_union][num_threads][num_blocks]: random numbers used for computing gradEI, for testing purpose only
+    :grad_ei[dim][num_to_sample]: pointer to value of gradient of Expected Improvement
+    :gpu_random_number_grad_ei[num_union][num_threads][num_blocks]: pointer to memory storing random
+     numbers used for computing gradEI, for testing purpose only
+  \return
+    CudaError state, which contains error information, file name, line and function name of the function that occurs error
+\endrst*/
+extern "C" CudaError CudaGetGradEI(double * __restrict__ mu, double * __restrict__ chol_var, double * __restrict__ grad_mu,
+                                   double * __restrict__ grad_chol_var, int num_union, int num_to_sample, int dim, int num_mc,
+                                   uint64_t seed, double best, bool configure_for_test, double* __restrict__ random_number_grad_ei,
+                                   double * __restrict__ grad_ei, double * __restrict__ gpu_mu, double * __restrict__ gpu_chol_var,
+                                   double * __restrict__ gpu_grad_mu, double * __restrict__ gpu_grad_chol_var,
+                                   double* __restrict__ gpu_random_number_grad_ei, double * __restrict__ gpu_grad_ei_storage);
+
+/*!\rst
+  Allocate GPU memory for storing an array. This is same as malloc in C, with error handling.
+  \param
+    :num_doubles: number of double numbers contained in the array
+    :address_of_ptr_to_gpu_memory: address of the pointer to memory on GPU
+  \return
+    CudaError state, which contains error information, file name, line and function name of the function that occurs error
+\endrst*/
+extern "C" CudaError CudaAllocateMemForDoubleVector(int num_doubles, double** __restrict__ address_of_ptr_to_gpu_memory);
+
+/*!\rst
+  Free GPU memory, same as free() in C.
+  \param
+    :ptr_to_gpu_memory: pointer to memory on GPU to free
+\endrst*/
+extern "C" void CudaFreeMem(double* __restrict__ ptr_to_gpu_memory);
+
+/*!\rst
+  Setup GPU device, and all GPU function calls will be operated on the GPU activated by this function.
+  \param
+    :devID: the ID of GPU device to setup
+  \return
+    CudaError state, which contains error information, file name, line and function name of the function that occurs error
+\endrst*/
+extern "C" CudaError CudaSetDevice(int devID);
+
+}   // end namespace optimal_learning
+#endif  // MOE_OPTIMAL_LEARNING_CPP_GPU_GPP_CUDA_MATH_HPP_