
Commit 44eb3cd

whchung authored and sunway513 committed
Merge pull request #33 from parallelo/nvfixes
CUDA target-related updates to facilitate upstreaming
1 parent 6ae3ca5 commit 44eb3cd


42 files changed, +175 −151 lines changed

tensorflow/compiler/xla/service/gpu/gpu_transfer_manager.cc

Lines changed: 7 additions & 9 deletions

@@ -22,10 +22,10 @@ limitations under the License.
 #include "llvm/IR/DataLayout.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 // XXX figure out how to cope with both platforms
-#if GOOGLE_CUDA
-#include "tensorflow/compiler/xla/service/gpu/nvptx_compiler.h"
-#elif TENSORFLOW_USE_ROCM
+#if TENSORFLOW_USE_ROCM
 #include "tensorflow/compiler/xla/service/gpu/amdgpu_compiler.h"
+#else
+#include "tensorflow/compiler/xla/service/gpu/nvptx_compiler.h"
 #endif
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/status_macros.h"
@@ -46,13 +46,11 @@ namespace xla {
 GpuTransferManager::GpuTransferManager(se::Platform::Id id)
     : GenericTransferManager(
           id,
-          // XXX figure out how to cope with both platforms
-#if GOOGLE_CUDA
-          /*pointer_size=*/llvm::DataLayout(gpu::NVPTXCompiler::kDataLayout)
-#elif TENSORFLOW_USE_ROCM
-          /*pointer_size=*/llvm::DataLayout(gpu::AMDGPUCompiler::kDataLayout)
+#if TENSORFLOW_USE_ROCM
+          llvm::DataLayout(gpu::AMDGPUCompiler::kDataLayout).getPointerSize(0)) {}
+#else
+          llvm::DataLayout(gpu::NVPTXCompiler::kDataLayout).getPointerSize(0)) {}
 #endif
-              .getPointerSize(0 /* default address space */)) {}
 
 Status GpuTransferManager::TransferLiteralToInfeed(
     se::StreamExecutor* executor, const LiteralSlice& literal) {
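Note on this hunk: the conditional is inverted so that TENSORFLOW_USE_ROCM is the explicit special case and the NVPTX path becomes the default, matching how upstream CUDA-only builds expect the file to compile. The pointer size is now computed inline from llvm::DataLayout. A minimal standalone sketch of that LLVM API, using an illustrative layout string rather than either compiler's actual kDataLayout:

    // DataLayout parses a target data-layout string; getPointerSize(0)
    // returns the pointer width in bytes for address space 0.
    #include <cstdio>
    #include "llvm/IR/DataLayout.h"

    int main() {
      // Illustrative layout string; pointers default to 64 bits when no
      // "p" spec is present.
      llvm::DataLayout dl("e-i64:64-i128:128-v16:16-v32:32-n16:32:64");
      unsigned bytes = dl.getPointerSize(0);  // 0 = default address space
      std::printf("pointer size: %u bytes\n", bytes);  // prints 8
      return 0;
    }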

tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/nvptx_backend_lib.cc

Lines changed: 1 addition & 1 deletion

@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.h"
+#include "tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/nvptx_backend_lib.h"
 
 #include <map>
 #include <memory>

tensorflow/compiler/xla/service/gpu/nvptx_executable.cc

Lines changed: 2 additions & 2 deletions

@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/compiler/xla/service/gpu/gpu_executable.h"
+#include "tensorflow/compiler/xla/service/gpu/nvptx_executable.h"
 
 #include <set>
 #include <utility>
@@ -45,7 +45,7 @@ NVPTXExecutable::NVPTXExecutable(
     std::unique_ptr<const BufferAssignment> assignment,
     std::unique_ptr<HloProfilePrinterData> hlo_profile_printer_data,
     std::unique_ptr<HloProfileIndexMap> hlo_profile_index_map)
-    : GpuExecutable(std::move(text), std::move(think_schedule),
+    : GpuExecutable(std::move(text), std::move(thunk_schedule),
                     std::move(hlo_module), std::move(assignment),
                     std::move(hlo_profile_printer_data),
                     std::move(hlo_profile_index_map)),
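Note: besides the header rename, this hunk fixes a typo in the member-initializer list — the constructor parameter is thunk_schedule, not think_schedule. A minimal sketch of the forwarding pattern, with simplified stand-in types (the real signatures take several more arguments):

    #include <memory>
    #include <string>
    #include <utility>

    struct ThunkSchedule {};  // stand-in for the real type

    // Shared base: owns the compiled text and the thunk schedule.
    class GpuExecutable {
     public:
      GpuExecutable(std::string text,
                    std::unique_ptr<ThunkSchedule> thunk_schedule)
          : text_(std::move(text)),
            thunk_schedule_(std::move(thunk_schedule)) {}

     private:
      std::string text_;
      std::unique_ptr<ThunkSchedule> thunk_schedule_;
    };

    // The platform-specific executable moves everything through to the base.
    class NVPTXExecutable : public GpuExecutable {
     public:
      NVPTXExecutable(std::string text,
                      std::unique_ptr<ThunkSchedule> thunk_schedule)
          : GpuExecutable(std::move(text), std::move(thunk_schedule)) {}
    };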

tensorflow/contrib/fused_conv/kernels/fused_conv2d_bias_activation_op.cc

Lines changed: 3 additions & 3 deletions

@@ -516,7 +516,7 @@ void LaunchFusedConv2DBiasActivationOp<GPUDevice, T, BiasType, ScaleType>::
   auto bias_ptr = AsDeviceMemory(bias.template flat<BiasType>().data(),
                                  bias.template flat<BiasType>().size());
 
-  static int64 ConvolveScratchSize = GetCudnnWorkspaceLimit(
+  static int64 ConvolveScratchSize = GetDnnWorkspaceLimit(
      // default value is in bytes despite the name of the environment variable
      "TF_CUDNN_WORKSPACE_LIMIT_IN_MB", 1LL << 32  // 4GB
      );
@@ -551,7 +551,7 @@ void LaunchFusedConv2DBiasActivationOp<GPUDevice, T, BiasType, ScaleType>::
     for (auto profile_algorithm : algorithms) {
       // TODO(zhengxq): profile each algorithm multiple times to better
       // accuracy.
-      CudnnScratchAllocator scratch_allocator(ConvolveScratchSize, ctx);
+      DnnScratchAllocator scratch_allocator(ConvolveScratchSize, ctx);
       dnn::ProfileResult profile_result;
       bool cudnn_launch_status =
           stream
@@ -591,7 +591,7 @@ void LaunchFusedConv2DBiasActivationOp<GPUDevice, T, BiasType, ScaleType>::
         algorithm_config);
   }
 
-  CudnnScratchAllocator scratch_allocator(ConvolveScratchSize, ctx);
+  DnnScratchAllocator scratch_allocator(ConvolveScratchSize, ctx);
   bool cudnn_launch_status =
       stream
           ->ThenFusedConvolveWithAlgorithm(
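Note: dropping the "Cudnn" prefix generalizes these helper names so they cover ROCm's MIOpen as well as cuDNN. A hedged sketch of what a GetDnnWorkspaceLimit-style helper does, inferred from the call site above (env var expressed in MB, default expressed in bytes); the real helper lives in TensorFlow's GPU kernel utilities:

    #include <cstdint>
    #include <cstdlib>

    int64_t GetDnnWorkspaceLimitSketch(const char* envvar_name,
                                       int64_t default_value_in_bytes) {
      const char* value = std::getenv(envvar_name);
      if (value == nullptr) return default_value_in_bytes;
      // The variable name ends in "_IN_MB", so scale megabytes to bytes.
      return std::strtoll(value, nullptr, 10) * (1LL << 20);
    }

    // Usage mirroring the diff: a 4GB default unless the user overrides it.
    // int64_t limit = GetDnnWorkspaceLimitSketch(
    //     "TF_CUDNN_WORKSPACE_LIMIT_IN_MB", 1LL << 32);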

tensorflow/contrib/image/kernels/adjust_hsv_in_yiq_op_gpu.cu.cc

Lines changed: 1 addition & 1 deletion

@@ -18,7 +18,7 @@ limitations under the License.
 #include "tensorflow/contrib/image/kernels/adjust_hsv_in_yiq_op.h"
 #include "tensorflow/core/kernels/gpu_utils.h"
 #include "tensorflow/core/platform/stream_executor.h"
-#include "tensorflow/core/util/cuda_kernel_helper.h"
+#include "tensorflow/core/util/gpu_kernel_helper.h"
 
 namespace tensorflow {
 

tensorflow/contrib/reduce_slice_ops/kernels/reduce_slice_ops_gpu.cu.cc

Lines changed: 6 additions & 6 deletions

@@ -21,7 +21,7 @@ limitations under the License.
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
-#include "tensorflow/core/util/cuda_kernel_helper.h"
+#include "tensorflow/core/util/gpu_kernel_helper.h"
 
 namespace tensorflow {
 
@@ -32,11 +32,11 @@ namespace functor {
 #define GPUReduceSliceFunctorReduceop(reduceop, beginning)                   \
   template <typename T, typename Index>                                      \
   __global__ void ReduceSliceDeviceKernel##reduceop(                         \
-      Cuda3DLaunchConfig config, Index indices_width, Index bound,           \
+      Gpu3DLaunchConfig config, Index indices_width, Index bound,            \
       const T begin, const Index *indices, const T *input, T *out) {         \
-    CUDA_AXIS_KERNEL_LOOP(x, config.virtual_thread_count.x, X) {             \
-      CUDA_AXIS_KERNEL_LOOP(y, config.virtual_thread_count.y, Y) {           \
-        CUDA_AXIS_KERNEL_LOOP(z, config.virtual_thread_count.z, Z) {         \
+    GPU_AXIS_KERNEL_LOOP(x, config.virtual_thread_count.x, X) {              \
+      GPU_AXIS_KERNEL_LOOP(y, config.virtual_thread_count.y, Y) {            \
+        GPU_AXIS_KERNEL_LOOP(z, config.virtual_thread_count.z, Z) {          \
           Index outidx = x * config.virtual_thread_count.y *                 \
                              config.virtual_thread_count.z +                 \
                          y * config.virtual_thread_count.z + z;              \
@@ -68,7 +68,7 @@ namespace functor {
     if (sizex * sizey * sizez == 0) {                                        \
       return;                                                                \
     }                                                                        \
-    Cuda3DLaunchConfig config = GetCuda3DLaunchConfig(                       \
+    Gpu3DLaunchConfig config = GetGpu3DLaunchConfig(                         \
        sizex, sizey, sizez, d, ReduceSliceDeviceKernel##reduceop<T, Index>,  \
        0, 0);                                                                \
                                                                              \
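Note: the renamed macros keep their CUDA-era semantics — a grid-stride loop along one launch axis, so a fixed-size grid covers an arbitrarily large virtual thread count. An illustrative single-axis version (not TF's exact macro, which is generated for X, Y, and Z):

    // Grid-stride loop along x: each thread starts at its global index and
    // strides by the total number of launched threads.
    #define AXIS_KERNEL_LOOP_X(i, n)                                \
      for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n);  \
           i += gridDim.x * blockDim.x)

    __global__ void ScaleKernel(int n, const float* in, float* out) {
      AXIS_KERNEL_LOOP_X(i, n) { out[i] = 2.0f * in[i]; }
    }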

tensorflow/contrib/resampler/kernels/resampler_ops_gpu.cu.cc

Lines changed: 9 additions & 9 deletions

@@ -23,7 +23,7 @@
 #include <cmath>
 
 #include "tensorflow/core/framework/register_types.h"
-#include "tensorflow/core/util/cuda_kernel_helper.h"
+#include "tensorflow/core/util/gpu_kernel_helper.h"
 
 namespace tensorflow {
 
@@ -43,7 +43,7 @@ __global__ void Resampler2DKernel(const T* __restrict__ data,
                                   const int data_channels,
                                   const int num_sampling_points) {
   const int output_data_size = batch_size * num_sampling_points * data_channels;
-  CUDA_1D_KERNEL_LOOP(index, output_data_size) {
+  GPU_1D_KERNEL_LOOP(index, output_data_size) {
     const int out_index = index;
 
     // Get (idxSample, channel, point) from the index.
@@ -117,8 +117,8 @@ struct Resampler2DFunctor<GPUDevice, T> {
                   const int data_channels, const int num_sampling_points) {
     const int output_data_size =
         batch_size * num_sampling_points * data_channels;
-    ::tensorflow::CudaLaunchConfig config =
-        ::tensorflow::GetCudaLaunchConfig(output_data_size, d);
+    ::tensorflow::GpuLaunchConfig config =
+        ::tensorflow::GetGpuLaunchConfig(output_data_size, d);
     Resampler2DKernel<T>
         <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
             data, warp, output, batch_size, data_height, data_width,
@@ -149,7 +149,7 @@ __global__ void ResamplerGrad2DKernel(
     const int num_sampling_points) {
   const int resampler_output_size =
       batch_size * num_sampling_points * data_channels;
-  CUDA_1D_KERNEL_LOOP(index, resampler_output_size) {
+  GPU_1D_KERNEL_LOOP(index, resampler_output_size) {
    const int out_index = index;
 
    // Get (idxSample, channel, point) from the index.
@@ -252,20 +252,20 @@ struct ResamplerGrad2DFunctor<GPUDevice, T> {
     const int grad_data_size =
         batch_size * data_height * data_width * data_channels;
 
-    ::tensorflow::CudaLaunchConfig config =
-        ::tensorflow::GetCudaLaunchConfig(grad_warp_size, d);
+    ::tensorflow::GpuLaunchConfig config =
+        ::tensorflow::GetGpuLaunchConfig(grad_warp_size, d);
     ::tensorflow::
         SetZero<<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
             grad_warp_size, grad_warp);
 
-    config = ::tensorflow::GetCudaLaunchConfig(grad_data_size, d);
+    config = ::tensorflow::GetGpuLaunchConfig(grad_data_size, d);
     ::tensorflow::
         SetZero<<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
             grad_data_size, grad_data);
 
     const int resampler_output_size =
         batch_size * num_sampling_points * data_channels;
-    config = ::tensorflow::GetCudaLaunchConfig(resampler_output_size, d);
+    config = ::tensorflow::GetGpuLaunchConfig(resampler_output_size, d);
     ResamplerGrad2DKernel<T>
         <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
             data, warp, grad_output, grad_data, grad_warp, batch_size,
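Note: every launch in this file follows the same two-step pattern — GetGpuLaunchConfig sizes the grid for an element count, and GPU_1D_KERNEL_LOOP strides each thread across that count inside the kernel. A hedged sketch with a stand-in macro (the real definitions are in tensorflow/core/util/gpu_kernel_helper.h):

    #define GPU_1D_KERNEL_LOOP_SKETCH(i, n)                         \
      for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n);  \
           i += gridDim.x * blockDim.x)

    __global__ void SetZeroSketch(int n, float* dst) {
      GPU_1D_KERNEL_LOOP_SKETCH(index, n) { dst[index] = 0.0f; }
    }

    // Call-site shape mirroring the diff:
    //   GpuLaunchConfig config = GetGpuLaunchConfig(grad_data_size, d);
    //   SetZeroSketch<<<config.block_count, config.thread_per_block, 0,
    //                   d.stream()>>>(grad_data_size, grad_data);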

tensorflow/contrib/rnn/kernels/lstm_ops_gpu.cu.cc

Lines changed: 3 additions & 3 deletions

@@ -22,7 +22,7 @@ limitations under the License.
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/kernels/eigen_activations.h"
 #include "tensorflow/core/platform/logging.h"
-#include "tensorflow/core/util/cuda_kernel_helper.h"
+#include "tensorflow/core/util/gpu_kernel_helper.h"
 
 namespace tensorflow {
 namespace functor {
@@ -186,7 +186,7 @@ void LSTMBlockCellFpropWithCUDA(
     typename TTypes<T>::Matrix co, typename TTypes<T>::Matrix icfo,
     typename TTypes<T>::Matrix h, int batch_size, int cell_size,
     int input_size) {
-  const cudaStream_t& cu_stream = GetCudaStream(ctx);
+  const cudaStream_t& cu_stream = GetGpuStream(ctx);
 
   // Concatenate xh = [x, h].
   //
@@ -321,7 +321,7 @@ void LSTMBlockCellBpropWithCUDA(
     typename TTypes<T>::Vec wci_grad, typename TTypes<T>::Vec wcf_grad,
     typename TTypes<T>::Vec wco_grad, const int batch_size, const int cell_size,
     const bool use_peephole) {
-  const cudaStream_t& cu_stream = GetCudaStream(ctx);
+  const cudaStream_t& cu_stream = GetGpuStream(ctx);
 
   dim3 block_dim_2d(std::min(batch_size, 8), 32);
   dim3 grid_dim_2d(Eigen::divup(batch_size, static_cast<int>(block_dim_2d.x)),
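Note: GetGpuStream replaces GetCudaStream as the helper that extracts the raw device stream from the op's context; the variable here stays typed cudaStream_t, which suits the CUDA path this PR targets. The raw stream matters because hand-written launches must be ordered with the rest of the op's queued work, as in this hedged .cu sketch (hypothetical kernel; in the real code the stream comes from GetGpuStream(ctx)):

    __global__ void MyKernel(int n, float* data);

    // Launch on the op's own stream so the kernel is serialized with the
    // op's other device work rather than running on the default stream.
    void LaunchOnOpStream(const cudaStream_t& cu_stream, int n, float* data) {
      dim3 block(256);
      dim3 grid((n + block.x - 1) / block.x);
      MyKernel<<<grid, block, 0, cu_stream>>>(n, data);
    }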

tensorflow/contrib/seq2seq/kernels/beam_search_ops_gpu.cu.cc

Lines changed: 3 additions & 3 deletions

@@ -18,7 +18,7 @@ limitations under the License.
 #define EIGEN_USE_GPU
 
 #include "tensorflow/contrib/seq2seq/kernels/beam_search_ops.h"
-#include "tensorflow/core/util/cuda_kernel_helper.h"
+#include "tensorflow/core/util/gpu_kernel_helper.h"
 
 namespace tensorflow {
 namespace functor {
@@ -31,7 +31,7 @@ __global__ void GatherTreeOpKernel(const int32 batch_size, const int32 max_time,
                                    const T* parent_ids,
                                    const int32* max_sequence_lengths,
                                    const T end_token, T* beams) {
-  CUDA_1D_KERNEL_LOOP(i, batch_size * beam_width) {
+  GPU_1D_KERNEL_LOOP(i, batch_size * beam_width) {
     const int32 batch = i / beam_width;
     const int32 beam = i % beam_width;
 
@@ -90,7 +90,7 @@ struct GatherTree<GPUDevice, T> {
     // First kernel launch to "zero" things out
     beams.device(d) = beams.constant(end_token);
 
-    CudaLaunchConfig config = GetCudaLaunchConfig(batch_size * beam_width, d);
+    GpuLaunchConfig config = GetGpuLaunchConfig(batch_size * beam_width, d);
     // clang-format off
     GatherTreeOpKernel<T>
         <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
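Note: a simplified sketch of what a GetGpuLaunchConfig-style helper computes from an element count (the real TF version also consults the device's occupancy limits before settling on a block count):

    struct GpuLaunchConfigSketch {
      int virtual_thread_count;  // total logical work items
      int thread_per_block;      // threads per block
      int block_count;           // blocks in the grid
    };

    GpuLaunchConfigSketch GetGpuLaunchConfigSketch(int work_element_count) {
      GpuLaunchConfigSketch config;
      config.virtual_thread_count = work_element_count;
      config.thread_per_block = 256;  // a common default block size
      config.block_count =
          (work_element_count + config.thread_per_block - 1) /
          config.thread_per_block;
      return config;
    }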

tensorflow/core/kernels/adjust_hue_op_gpu.cu.cc

Lines changed: 1 addition & 1 deletion

@@ -33,7 +33,7 @@ void AdjustHueGPU::operator()(GPUDevice* device, const int64 number_of_elements,
   const int threads_per_block = config.thread_per_block;
   const int block_count =
       (number_of_elements + threads_per_block - 1) / threads_per_block;
-  GPU_LAUNCH_KERNEL(internal::adjust_hsv_nhwc<true, false, false>,
+  GPU_LAUNCH_KERNEL((internal::adjust_hsv_nhwc<true, false, false>),
                     dim3(block_count), dim3(threads_per_block), 0, stream,
                     number_of_elements, input, output, delta, nullptr, nullptr);
 }
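Note: the only change here is the extra parentheses around the kernel name. The C preprocessor splits macro arguments on top-level commas, so the commas inside adjust_hsv_nhwc<true, false, false> would otherwise be parsed as additional macro arguments. A minimal demonstration with a hypothetical three-argument launch macro (the principle is the same for GPU_LAUNCH_KERNEL):

    #define LAUNCH3(kernel, grid, block) (kernel)<<<(grid), (block)>>>()

    template <bool A, bool B>
    __global__ void k() {}

    void demo() {
      // LAUNCH3(k<true, false>, 1, 1);  // error: macro sees four arguments
      LAUNCH3((k<true, false>), 1, 1);   // OK: parentheses hide the comma
    }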
