diff --git a/src/c_api/c_api_ndarray.cc b/src/c_api/c_api_ndarray.cc
index 529da36c82cd..8d190597ab0b 100644
--- a/src/c_api/c_api_ndarray.cc
+++ b/src/c_api/c_api_ndarray.cc
@@ -289,7 +289,6 @@ void PushFCompute(const FCompute& fn,
                       requested};
       if (ctx.dev_mask() == gpu::kDevMask) {
 #if MXNET_USE_CUDA
-#if __CUDACC__
         GetDefaultBlobs<gpu>(ndinputs, &input_blobs, &temp_in, opctx);
         GetDefaultBlobs<gpu>(ndoutputs, &output_blobs, &temp_out, opctx);
         std::vector<OpReqType> req(output_blobs.size(), kWriteTo);
@@ -297,7 +296,6 @@ void PushFCompute(const FCompute& fn,
         // cast to original storage type, if necessary
         CastNonDefaultStorage<gpu>(ndoutputs, temp_out, opctx);
         rctx.get_stream<gpu>()->Wait();
-#endif  // __CUDACC__
 #else
         LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR;
 #endif
diff --git a/src/common/utils.cc b/src/common/utils.cc
new file mode 100644
index 000000000000..5bfb959fdf34
--- /dev/null
+++ b/src/common/utils.cc
@@ -0,0 +1,23 @@
+/*!
+ * Copyright (c) 2017 by Contributors
+ * \file utils.cc
+ * \brief cpu specialization of CastStorageDispatch, which is declared in utils.h
+ */
+
+#include "./utils.h"
+#include "../operator/nn/cast_storage-inl.h"
+
+namespace mxnet {
+namespace common {
+
+// cpu specialization: utils.h only declares CastStorageDispatch, so the body lives here.
+template<>
+void CastStorageDispatch<cpu>(mshadow::Stream<cpu>* s,
+                              const NDArray& input,
+                              const NDArray& output) {
+  mxnet::op::CastStorageComputeImpl(s, input, output);  // forward to the operator-level cast
+}
+
+
+}  // namespace common
+}  // namespace mxnet
diff --git a/src/common/utils.cu b/src/common/utils.cu
new file mode 100644
index 000000000000..a249be5bb9f5
--- /dev/null
+++ b/src/common/utils.cu
@@ -0,0 +1,21 @@
+/*!
+ * Copyright (c) 2017 by Contributors
+ * \file utils.cu
+ * \brief gpu specialization of CastStorageDispatch, which is declared in utils.h
+ */
+
+#include "./utils.h"
+#include "../operator/nn/cast_storage-inl.h"
+
+namespace mxnet {
+namespace common {
+// gpu specialization; cast_storage-inl.h pulls in its .cuh only when __CUDACC__ is defined.
+template<>
+void CastStorageDispatch<gpu>(mshadow::Stream<gpu>* s,
+                              const NDArray& input,
+                              const NDArray& output) {
+  mxnet::op::CastStorageComputeImpl(s, input, output);  // forward to the operator-level cast
+}
+
+}  // namespace common
+}  // namespace mxnet
diff --git a/src/common/utils.h b/src/common/utils.h
index 803c3af486c7..e87e42497b59 100644
--- a/src/common/utils.h
+++ b/src/common/utils.h
@@ -6,7 +6,12 @@
 #ifndef MXNET_COMMON_UTILS_H_
 #define MXNET_COMMON_UTILS_H_
 
-#if DMLC_USE_CXX11
+#include <dmlc/logging.h>
+#include <mxnet/engine.h>
+#include <mxnet/ndarray.h>
+#include <mxnet/op_attr_types.h>
+#include <nnvm/graph_attr_types.h>
+
 #include <memory>
 #include <vector>
 #include <type_traits>
@@ -15,20 +20,14 @@
 #include <thread>
 #include <algorithm>
 #include <functional>
-#endif  // DMLC_USE_CXX11
-
-#include <dmlc/logging.h>
-#include <mxnet/engine.h>
-#include <mxnet/ndarray.h>
-#include <mxnet/op_attr_types.h>
-#include <nnvm/graph_attr_types.h>
-#include "../operator/nn/cast_storage-inl.h"
 
 namespace mxnet {
 
 namespace common {
 
-#if DMLC_USE_CXX11
+template<typename xpu>  // declaration only; specialized in utils.cc (cpu) and utils.cu (gpu)
+void CastStorageDispatch(mshadow::Stream<xpu>* s, const NDArray& input, const NDArray& output);
+
 /*
  * \brief Get the corresponding tensor blobs from default storage NDArrays.
  *        If any NDArray is of non-default storage, it is casted to default storage and
@@ -54,7 +53,7 @@ inline bool GetDefaultBlobs(const std::vector<NDArray>& nds,
                    << "doesn't support NDArray inputs with non-default storage.";
       }
       NDArray temp(nd.shape(), nd.ctx(), false);
-      mxnet::op::CastStorageComputeImpl<xpu>(ctx.get_stream<xpu>(), nd, temp);
+      CastStorageDispatch<xpu>(ctx.get_stream<xpu>(), nd, temp);
       temps->push_back(temp);
       blobs->push_back(temp.data());
       casted = true;
@@ -65,14 +64,6 @@ inline bool GetDefaultBlobs(const std::vector<NDArray>& nds,
   return casted;
 }
 
-template <typename xpu>
-inline void GetOutputBlobs(const std::vector<NDArray>& nds,
-                           std::vector<TBlob> *blobs) {
-  for (auto& nd : nds) {
-    blobs->push_back(nd.data());
-  }
-}
-
 /*
  * \brief Cast the NDArrays in `src` according to the storage types of the NDArrays
  *        in `dst`. The ones with default storage in `dst` are ignored.
@@ -98,7 +89,7 @@ inline void CastNonDefaultStorage(const std::vector<NDArray>& dst,
                    << "You are probably executing an operator which "
                    << "doesn't support NDArray inputs with non-default storage.";
       }
-      mxnet::op::CastStorageComputeImpl(ctx.get_stream<xpu>(), src[src_idx++], dst[i]);
+      CastStorageDispatch<xpu>(ctx.get_stream<xpu>(), src[src_idx++], dst[i]);
     }
   }
   CHECK_EQ(src_idx, src.size()) << "Not all src NDArrays are casted";
@@ -299,8 +290,6 @@ typename helper::UniqueIf<T>::UnknownBound MakeUnique(size_t n) {
 template <class T, class... Args>
 typename helper::UniqueIf<T>::KnownBound MakeUnique(Args&&... args) = delete;
 
-#endif  // DMLC_USE_CXX11
-
 }  // namespace common
 }  // namespace mxnet
 #endif  // MXNET_COMMON_UTILS_H_
diff --git a/src/executor/attach_op_execs_pass.cc b/src/executor/attach_op_execs_pass.cc
index 3e037a98b736..981fab2a4686 100644
--- a/src/executor/attach_op_execs_pass.cc
+++ b/src/executor/attach_op_execs_pass.cc
@@ -40,13 +40,11 @@ class ForwardOpExecutor : public OpExecutor {
     temp_in_.clear(); temp_out_.clear(); temp_aux_.clear();
     if (is_gpu) {
 #if MXNET_USE_CUDA
-#if __CUDACC__
       GetDefaultBlobs<gpu>(in_array_, &in_data_, &temp_in_, op_ctx);
       GetDefaultBlobs<gpu>(aux_array_, &aux_data_, &temp_aux_, op_ctx);
       GetDefaultBlobs<gpu>(out_array, &out_data_, &temp_out_, op_ctx);
       op_->Forward(op_ctx, in_data_, req, out_data_, aux_data_);
       CastNonDefaultStorage<gpu>(out_array, temp_out_, op_ctx);
-#endif  // __CUDACC__
 #elif NDEBUG == 0
       LOG(DEBUG) << MXNET_GPU_NOT_ENABLED_ERROR;
 #endif
@@ -173,12 +171,10 @@ class FComputeExecutor : public OpExecutor {
       temp_in_.clear(); temp_out_.clear();
       if (is_gpu) {
 #if MXNET_USE_CUDA
-#if __CUDACC__
         GetDefaultBlobs<gpu>(in_array, &in_data_, &temp_in_, op_ctx);
         GetDefaultBlobs<gpu>(out_array, &out_data_, &temp_out_, op_ctx);
         fcompute_(attrs_, op_ctx, in_data_, req, out_data_);
         CastNonDefaultStorage<gpu>(out_array, temp_out_, op_ctx);
-#endif  // __CUDACC__
 #else
         LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR;
 #endif
diff --git a/src/ndarray/ndarray.cc b/src/ndarray/ndarray.cc
index 22cb70e275a2..c894f27c25b7 100644
--- a/src/ndarray/ndarray.cc
+++ b/src/ndarray/ndarray.cc
@@ -12,6 +12,7 @@
 #include <mxnet/resource.h>
 #include <mshadow/tensor.h>
 #include "./ndarray_function.h"
+#include "../common/utils.h"
 #include "../operator/tensor/matrix_op-inl.h"
 #include "../operator/tensor/init_op.h"
 #include "./autograd.h"
@@ -466,7 +467,7 @@ void CopyFromToImpl(const NDArray from, NDArray *to, RunContext ctx) {
     } else {
       casted_nd = NDArray(to_stype, shape, from_ctx);
     }
-    op::CastStorageComputeImpl<from_xpu>(s, from, casted_nd);
+    common::CastStorageDispatch<from_xpu>(s, from, casted_nd);
   } else {
     casted_nd = from;
   }
@@ -510,7 +511,6 @@ void CopyFromTo(const NDArray &from, NDArray *to, int priority) {
       FnProperty::kNormal, priority, PROFILER_MESSAGE("CopyCPU2CPU"));
   } else {
 #if MXNET_USE_CUDA
-#if __CUDACC__
     if (a == cpu::kDevMask && b == gpu::kDevMask) {
       Engine::Get()->PushSync([from, ret](RunContext ctx) {
           NDArray nd(ret);
@@ -533,7 +533,6 @@ void CopyFromTo(const NDArray &from, NDArray *to, int priority) {
     } else {
       LOG(FATAL) << "unknown device mask";
     }
-#endif  // __CUDACC__
 #else
     LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR;
 #endif
diff --git a/src/operator/mxnet_op.h b/src/operator/mxnet_op.h
index 6a9ee30f1b04..d4a473c8be0c 100644
--- a/src/operator/mxnet_op.h
+++ b/src/operator/mxnet_op.h
@@ -211,7 +211,6 @@ __global__ void mxnet_generic_kernel(int N, Args... args) {
   }
 }
 
-
 template<typename OP>
 struct Kernel<OP, gpu> {
   template<typename ...Args>
diff --git a/src/operator/nn/cast_storage-inl.h b/src/operator/nn/cast_storage-inl.h
index 921003453e1a..b60665c73fe9 100644
--- a/src/operator/nn/cast_storage-inl.h
+++ b/src/operator/nn/cast_storage-inl.h
@@ -10,7 +10,7 @@
 #include "../mxnet_op.h"
 #ifdef __CUDACC__
 #include "./cast_storage-inl.cuh"
-#endif
+#endif  // __CUDACC__
 
 
 namespace mxnet {