Commit b056653

Merge pull request #23 from iotamudelta/cubkernels_v2

Cubkernels v2

2 parents: 8b9a948 + d7ea152

4 files changed (+119, -91 lines)

rocm_docs/core_kernels.md

Lines changed: 32 additions & 32 deletions
@@ -441,7 +441,7 @@ Kernels under tensorflow/core/kernels
 | O | | dilation_ops_gpu.cu.cc
 | O | | draw_bounding_box_op.cc
 | O | | dynamic_partition_op.cc
-| X |cub | dynamic_partition_op_gpu.cu.cc
+| P |rocPRIM | dynamic_partition_op_gpu.cu.cc
 | O | | dynamic_partition_op_test.cc
 | O | | dynamic_stitch_op.cc
 | O | | dynamic_stitch_op_gpu.cu.cc
@@ -581,9 +581,9 @@ Kernels under tensorflow/core/kernels
 | O | | inplace_ops.cc
 | O | | inplace_ops_functor.h
 | O | | inplace_ops_functor_gpu.cu.cc
-| P |cub | l2loss_op.cc
-| P |cub | l2loss_op.h
-| P |cub | l2loss_op_gpu.cu.cc
+| P |rocPRIM | l2loss_op.cc
+| P |rocPRIM | l2loss_op.h
+| P |rocPRIM | l2loss_op_gpu.cu.cc
 | O | | linalg_ops_common.cc
 | O | | linalg_ops_common.h
 | O | | list_kernels.cc
@@ -796,26 +796,26 @@ Kernels under tensorflow/core/kernels
 | O | | record_yielder.cc
 | O | | record_yielder.h
 | O | | reduce_join_op.cc
-| P |cub | reduction_gpu_kernels.cu.h
-| P |cub | reduction_ops.h
-| P |cub | reduction_ops_all.cc
-| P |cub | reduction_ops_any.cc
-| P |cub | reduction_ops_common.cc
-| P |cub | reduction_ops_common.h
-| P |cub | reduction_ops_gpu_bool.cu.cc
+| P |rocPRIM | reduction_gpu_kernels.cu.h
+| P |rocPRIM | reduction_ops.h
+| P |rocPRIM | reduction_ops_all.cc
+| P |rocPRIM | reduction_ops_any.cc
+| P |rocPRIM | reduction_ops_common.cc
+| P |rocPRIM | reduction_ops_common.h
+| P |rocPRIM | reduction_ops_gpu_bool.cu.cc
 | X |cub | reduction_ops_gpu_complex128.cu.cc
 | X |cub | reduction_ops_gpu_complex64.cu.cc
-| P |cub | reduction_ops_gpu_double.cu.cc
-| P |cub | reduction_ops_gpu_float.cu.cc
-| P |cub | reduction_ops_gpu_int.cu.cc
-| P |cub | reduction_ops_half_mean_sum.cu.cc
-| P |cub | reduction_ops_half_prod_max_min.cu.cc
-| P |cub | reduction_ops_max.cc
-| P |cub | reduction_ops_mean.cc
-| P |cub | reduction_ops_min.cc
-| P |cub | reduction_ops_prod.cc
-| P |cub | reduction_ops_sum.cc
-| P |cub | reduction_ops_test.cc
+| P |rocPRIM | reduction_ops_gpu_double.cu.cc
+| P |rocPRIM | reduction_ops_gpu_float.cu.cc
+| P |rocPRIM | reduction_ops_gpu_int.cu.cc
+| P |rocPRIM | reduction_ops_half_mean_sum.cu.cc
+| P |rocPRIM | reduction_ops_half_prod_max_min.cu.cc
+| P |rocPRIM | reduction_ops_max.cc
+| P |rocPRIM | reduction_ops_mean.cc
+| P |rocPRIM | reduction_ops_min.cc
+| P |rocPRIM | reduction_ops_prod.cc
+| P |rocPRIM | reduction_ops_sum.cc
+| P |rocPRIM | reduction_ops_test.cc
 | O | | reference_gemm.h
 | O | | regex_replace_op.cc
 | O | | relu_op.cc
@@ -940,9 +940,9 @@ Kernels under tensorflow/core/kernels
 | O | | snapshot_op.cc
 | O | | snapshot_op.h
 | O | | snapshot_op_gpu.cu.cc
-| P |cub | softmax_op.cc
-| P |cub | softmax_op_functor.h
-| P |cub | softmax_op_gpu.cu.cc
+| P |rocPRIM | softmax_op.cc
+| P |rocPRIM | softmax_op_functor.h
+| P |rocPRIM | softmax_op_gpu.cu.cc
 | O | | softplus_op.cc
 | O | | softplus_op.h
 | O | | softplus_op_gpu.cu.cc
@@ -1107,13 +1107,13 @@ Kernels under tensorflow/core/kernels
 | O | | warn_about_ints.cc
 | O | | warn_about_ints.h
 | X |cub | where_op.cc
-| X |cub | where_op.h
-| X |cub | where_op_gpu.cu.h
-| X |cub | where_op_gpu_impl_1.cu.cc
-| X |cub | where_op_gpu_impl_2.cu.cc
-| X |cub | where_op_gpu_impl_3.cu.cc
-| X |cub | where_op_gpu_impl_4.cu.cc
-| X |cub | where_op_gpu_impl_5.cu.cc
+| O | | where_op.h
+| P |rocPRIM | where_op_gpu.cu.h
+| P |rocPRIM | where_op_gpu_impl_1.cu.cc
+| P |rocPRIM | where_op_gpu_impl_2.cu.cc
+| P |rocPRIM | where_op_gpu_impl_3.cu.cc
+| P |rocPRIM | where_op_gpu_impl_4.cu.cc
+| P |rocPRIM | where_op_gpu_impl_5.cu.cc
 | O | | whole_file_read_ops.cc
 | O | | winograd_transform.h
 | O | | word2vec_kernels.cc

tensorflow/core/kernels/dynamic_partition_op_gpu.cu.cc

Lines changed: 39 additions & 23 deletions
@@ -16,12 +16,12 @@ limitations under the License.
 // The algorithm for dynamic partition has the following steps:
 // 1. Let N be the size of partitions. We initialize a new vector indices_in
 //    with the values 0, 1, 2, ..., N-1.
-// 2. We apply cub::DeviceRadixSort::SortPairs to the key - value pairs given
+// 2. We apply gpuprim::DeviceRadixSort::SortPairs to the key - value pairs given
 //    by partitions and indices_in. This will result in two new vectors
 //    partitions_out and indices_out, with partitions_out sorted.
 // 3. The first dimension of outputs[i] is equal to the number of i-values in
 //    partitions_out. We determine it in two steps:
-//    - apply cub::DeviceReduce::ReduceByKey to count how many times each value
+//    - apply gpuprim::DeviceReduce::ReduceByKey to count how many times each value
 //      appears in partitions_out,
 //    - move the results to partition_count. This handles missing values
 //      (corresponding to empty parts).
@@ -31,14 +31,18 @@ limitations under the License.
 // This works, because for each interval of i-values, indices_out points
 // to the slices which should form output[i].
 
-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 
 #define EIGEN_USE_GPU
 
+#if GOOGLE_CUDA
 #include "external/cub_archive/cub/device/device_radix_sort.cuh"
 #include "external/cub_archive/cub/device/device_reduce.cuh"
 #include "external/cub_archive/cub/iterator/constant_input_iterator.cuh"
 #include "external/cub_archive/cub/thread/thread_operators.cuh"
+#elif TENSORFLOW_USE_ROCM
+#include "external/rocprim_archive/hipcub/include/hipcub/hipcub.hpp"
+#endif
 #include "tensorflow/core/common_runtime/gpu/gpu_event_mgr.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
@@ -50,6 +54,12 @@ limitations under the License.
 #include "tensorflow/core/util/gpu_kernel_helper.h"
 #include "tensorflow/core/util/transform_output_iterator.h"
 
+#if GOOGLE_CUDA
+namespace gpuprim = ::cub;
+#elif TENSORFLOW_USE_ROCM
+namespace gpuprim = ::hipcub;
+#endif
+
 namespace tensorflow {
 
 typedef Eigen::GpuDevice GPUDevice;
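The namespace alias added here is the heart of the port: the rest of the file now calls gpuprim::, and the preprocessor decides whether that resolves to cub:: (CUDA) or hipcub:: (ROCm). A minimal, hedged sketch of the same pattern outside TensorFlow (it uses the upstream <cub/cub.cuh> and <hipcub/hipcub.hpp> headers rather than the external/ archive paths above; gpuprim_sum_demo is a made-up helper for illustration only):

#if defined(TENSORFLOW_USE_ROCM)
#include <hipcub/hipcub.hpp>
namespace gpuprim = ::hipcub;   // ROCm build: gpuprim:: resolves to hipCUB
#else
#include <cub/cub.cuh>
namespace gpuprim = ::cub;      // CUDA build: gpuprim:: resolves to CUB
#endif

// cub::Sum and hipcub::Sum expose the same functor interface, so code written
// against gpuprim::Sum compiles unchanged on either backend.
inline int gpuprim_sum_demo(int a, int b) {
  gpuprim::Sum sum_op;
  return sum_op(a, b);
}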
@@ -59,14 +69,14 @@ namespace {
 template <typename T>
 __global__ void RangeInitKernel(const T start, const T delta, const int32 size,
                                 T* out) {
-  CUDA_1D_KERNEL_LOOP(i, size) { out[i] = start + i * delta; }
+  GPU_1D_KERNEL_LOOP(i, size) { out[i] = start + i * delta; }
 }
 
 __global__ void MoveValuesKernel(const int32* keys, const int32* values,
                                  const int32* size, int32 out_size,
                                  int32* out) {
   int32 N = min(ldg(size), out_size);
-  CUDA_1D_KERNEL_LOOP(i, N) {
+  GPU_1D_KERNEL_LOOP(i, N) {
     int32 key = ldg(keys + i);
     int32 value = ldg(values + i);
     if (FastBoundsCheck(key, out_size)) out[key] = value;
@@ -78,9 +88,9 @@ __global__ void MoveValuesKernel(const int32* keys, const int32* values,
 template <typename T>
 void RangeInit(const GPUDevice& d, const T start, const T delta,
                const int32 size, typename TTypes<T>::Flat out) {
-  CudaLaunchConfig config = GetCudaLaunchConfig(size, d);
-  RangeInitKernel<T>
-      <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
+  GpuLaunchConfig config = GetGpuLaunchConfig(size, d);
+  GPU_LAUNCH_KERNEL(RangeInitKernel<T>, dim3(config.block_count),
+                    dim3(config.thread_per_block), 0, d.stream(),
           start, delta, size, out.data());
 }
 
@@ -93,18 +103,19 @@ void MoveValues(const GPUDevice& d, int32* keys, int32* values, int32* num_runs,
   // This is valid for correct inputs, because then out_size >= *num_runs.
   // For wrong inputs, we may have out_size < *num_runs. In this case we will
   // only handle the first out_size values.
-  CudaLaunchConfig config = GetCudaLaunchConfig(out_size, d);
-  MoveValuesKernel<<<config.block_count, config.thread_per_block, 0,
-                     d.stream()>>>(keys, values, num_runs, out_size, out);
+  GpuLaunchConfig config = GetGpuLaunchConfig(out_size, d);
+  GPU_LAUNCH_KERNEL(MoveValuesKernel, dim3(config.block_count),
+                    dim3(config.thread_per_block), 0, d.stream(), keys, values,
+                    num_runs, out_size, out);
 }
 
 template <typename T>
 void CallGatherKernel(const GPUDevice& d, const T* params, const int32* indices,
                       T* out, int64 gather_dim_size, int64 indices_size,
                       int64 slice_size, int64 out_size) {
-  CudaLaunchConfig config = GetCudaLaunchConfig(out_size, d);
-  GatherOpKernel<T, int32, true>
-      <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
+  GpuLaunchConfig config = GetGpuLaunchConfig(out_size, d);
+  GPU_LAUNCH_KERNEL(GatherOpKernel<T, int32, true>,
+      dim3(config.block_count), dim3(config.thread_per_block), 0, d.stream(),
       params, indices, out, gather_dim_size, indices_size, slice_size,
       out_size);
 }
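The three hunks above are the launch-syntax half of the port: CUDA_1D_KERNEL_LOOP becomes GPU_1D_KERNEL_LOOP, CudaLaunchConfig/GetCudaLaunchConfig become GpuLaunchConfig/GetGpuLaunchConfig, and the CUDA-only triple-chevron launches become GPU_LAUNCH_KERNEL calls that hipcc can also compile. As a rough illustration only (plain CUDA, not the TensorFlow helpers; block and thread counts are hard-coded here, whereas GetGpuLaunchConfig derives them from the device), a grid-stride RangeInit kernel and its launch look roughly like this:

#include <cstdio>
#include <cuda_runtime.h>

// Grid-stride loop: each thread handles indices i, i + stride, i + 2*stride, ...
// This is the looping structure that the *_1D_KERNEL_LOOP macros express in one line.
__global__ void RangeInitKernel(float start, float delta, int size, float* out) {
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < size;
       i += gridDim.x * blockDim.x) {
    out[i] = start + i * delta;
  }
}

int main() {
  const int size = 1000;
  float* d_out = nullptr;
  cudaMalloc(&d_out, size * sizeof(float));

  // Hand-rolled launch configuration for the sketch.
  const int threads = 256;
  const int blocks = (size + threads - 1) / threads;
  RangeInitKernel<<<blocks, threads, 0 /* shared mem */, 0 /* default stream */>>>(
      0.0f, 1.0f, size, d_out);
  cudaDeviceSynchronize();

  float first = 0.0f, last = 0.0f;
  cudaMemcpy(&first, d_out, sizeof(float), cudaMemcpyDeviceToHost);
  cudaMemcpy(&last, d_out + size - 1, sizeof(float), cudaMemcpyDeviceToHost);
  std::printf("out[0]=%.1f out[%d]=%.1f\n", first, size - 1, last);
  cudaFree(d_out);
  return 0;
}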
@@ -180,7 +191,7 @@ class BoundedOutputIterator
 // I + P + max(3N + R + P, O + N), where:
 // I - the size of the input
 // N - the size of the partitions tensor
-// R - the temporary storage used by cub::RadixSort, about 2N
+// R - the temporary storage used by gpuprim::RadixSort, about 2N
 // P - the number of partitions
 // O - the size of the output
 // So roughly the cost is I + P + max(5N, O + N).
@@ -325,7 +336,7 @@ class DynamicPartitionOpGPU : public AsyncOpKernel {
                      Tensor* indices_out, DoneCallback done) {
     int32 N = partitions->NumElements();
     const GPUDevice& device = c->eigen_device<GPUDevice>();
-    const cudaStream_t& cu_stream = GetCudaStream(c);
+    const gpuStream_t& cu_stream = GetGPUStream(c);
 
     // Initialize the indices_in tensor using the Range GPU kernel.
     RangeInit(device, 0, 1, N, indices_in->flat<int32>());
@@ -337,7 +348,7 @@ class DynamicPartitionOpGPU : public AsyncOpKernel {
     // Determine temporary device storage requirements.
     Tensor cub_temp_storage;
     size_t temp_storage_bytes = 0;
-    cub::DeviceRadixSort::SortPairs(
+    gpuprim::DeviceRadixSort::SortPairs(
         NULL, temp_storage_bytes, partitions_ptr, partitions_out_ptr,
         indices_in_ptr, indices_out_ptr, N, 0, sizeof(int32) * 8, cu_stream);
     // Allocate temporary storage.
// Allocate temporary storage.
@@ -348,7 +359,7 @@ class DynamicPartitionOpGPU : public AsyncOpKernel {
348359
&cub_temp_storage),
349360
done);
350361
// Radix-sort the partition information.
351-
cub::DeviceRadixSort::SortPairs(
362+
gpuprim::DeviceRadixSort::SortPairs(
352363
cub_temp_storage.flat<int8>().data(), temp_storage_bytes,
353364
partitions_ptr, partitions_out_ptr, indices_in_ptr, indices_out_ptr, N,
354365
0, sizeof(int32) * 8, cu_stream);
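Both SortPairs calls follow CUB's two-phase protocol: the first call passes NULL as the temp-storage pointer, so the library only reports temp_storage_bytes; the caller allocates that many bytes (here in the cub_temp_storage tensor) and then repeats the identical call to do the actual sort. A standalone, hedged sketch of the same protocol against upstream CUB, sorting a small partitions/indices pair on the default stream:

#include <cstdio>
#include <cuda_runtime.h>
#include <cub/device/device_radix_sort.cuh>

int main() {
  const int N = 8;
  int h_partitions[N] = {2, 0, 1, 0, 2, 1, 0, 2};   // keys
  int h_indices[N]    = {0, 1, 2, 3, 4, 5, 6, 7};   // values (0..N-1)

  int *d_keys_in, *d_keys_out, *d_vals_in, *d_vals_out;
  cudaMalloc(&d_keys_in, N * sizeof(int));
  cudaMalloc(&d_keys_out, N * sizeof(int));
  cudaMalloc(&d_vals_in, N * sizeof(int));
  cudaMalloc(&d_vals_out, N * sizeof(int));
  cudaMemcpy(d_keys_in, h_partitions, N * sizeof(int), cudaMemcpyHostToDevice);
  cudaMemcpy(d_vals_in, h_indices, N * sizeof(int), cudaMemcpyHostToDevice);

  // Phase 1: null storage pointer, CUB only computes temp_storage_bytes.
  void* d_temp_storage = nullptr;
  size_t temp_storage_bytes = 0;
  cub::DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, d_keys_in,
                                  d_keys_out, d_vals_in, d_vals_out, N, 0,
                                  sizeof(int) * 8);
  cudaMalloc(&d_temp_storage, temp_storage_bytes);

  // Phase 2: the same call with real storage performs the sort.
  cub::DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, d_keys_in,
                                  d_keys_out, d_vals_in, d_vals_out, N, 0,
                                  sizeof(int) * 8);
  cudaDeviceSynchronize();

  int keys[N], vals[N];
  cudaMemcpy(keys, d_keys_out, N * sizeof(int), cudaMemcpyDeviceToHost);
  cudaMemcpy(vals, d_vals_out, N * sizeof(int), cudaMemcpyDeviceToHost);
  for (int i = 0; i < N; ++i)
    std::printf("partition %d <- index %d\n", keys[i], vals[i]);

  cudaFree(d_keys_in); cudaFree(d_keys_out);
  cudaFree(d_vals_in); cudaFree(d_vals_out); cudaFree(d_temp_storage);
  return 0;
}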
@@ -358,7 +369,7 @@ class DynamicPartitionOpGPU : public AsyncOpKernel {
                      Tensor* partition_count, Tensor* indices_out,
                      DoneCallback done) {
     const GPUDevice& device = c->eigen_device<GPUDevice>();
-    const cudaStream_t& cu_stream = GetCudaStream(c);
+    const gpuStream_t& cu_stream = GetGPUStream(c);
     int32 N = partitions->NumElements();
     Tensor indices_in;
     Tensor partitions_out;
@@ -395,8 +406,13 @@ class DynamicPartitionOpGPU : public AsyncOpKernel {
     BoundedOutputIterator aggregates_out_it(aggregates_out_ptr, id_op,
                                             num_partitions_);
 
+#if GOOGLE_CUDA
     cub::ConstantInputIterator<int32> values_in(1);
-    cub::Sum reduction_op;
+#elif TENSORFLOW_USE_ROCM
+    using ConstantInputIterator = ::rocprim::constant_iterator<int32, ptrdiff_t>;
+    ConstantInputIterator values_in(1);
+#endif
+    gpuprim::Sum reduction_op;
 
     // Allocate space on GPU for the number of runs. This is required by CUB.
     Tensor num_runs;
@@ -407,7 +423,7 @@ class DynamicPartitionOpGPU : public AsyncOpKernel {
     // Determine temporary device storage requirements
     Tensor cub_temp_storage;
     size_t temp_storage_bytes = 0;
-    cub::DeviceReduce::ReduceByKey(NULL, temp_storage_bytes, keys_in_ptr,
+    gpuprim::DeviceReduce::ReduceByKey(NULL, temp_storage_bytes, keys_in_ptr,
                                    unique_out_it, values_in, aggregates_out_it,
                                    num_runs_ptr, reduction_op, N, cu_stream);
     // Allocate temporary storage.
// Allocate temporary storage.
@@ -421,7 +437,7 @@ class DynamicPartitionOpGPU : public AsyncOpKernel {
421437
// each index appears in partitions. The distinct indices are stored
422438
// in unique_out, while the count is stored in aggregates_out.
423439
// The total number of distinct indices is stored in num_runs.
424-
cub::DeviceReduce::ReduceByKey(cub_temp_storage.flat<int8>().data(),
440+
gpuprim::DeviceReduce::ReduceByKey(cub_temp_storage.flat<int8>().data(),
425441
temp_storage_bytes, keys_in_ptr,
426442
unique_out_it, values_in, aggregates_out_it,
427443
num_runs_ptr, reduction_op, N, cu_stream);
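This ReduceByKey pair uses the same query-then-run protocol, and the trick that makes it count partition sizes is the values iterator: every value is the constant 1 (cub::ConstantInputIterator on CUDA, rocprim::constant_iterator on ROCm, as in the hunk further up), so summing the values within each run of equal keys yields the run length. A hedged, standalone CUB sketch of that counting pattern:

#include <cstdio>
#include <cuda_runtime.h>
#include <cub/device/device_reduce.cuh>
#include <cub/iterator/constant_input_iterator.cuh>
#include <cub/thread/thread_operators.cuh>

int main() {
  const int N = 8;
  int h_keys[N] = {0, 0, 0, 1, 1, 2, 2, 2};  // already sorted, as after SortPairs

  int *d_keys, *d_unique, *d_counts, *d_num_runs;
  cudaMalloc(&d_keys, N * sizeof(int));
  cudaMalloc(&d_unique, N * sizeof(int));
  cudaMalloc(&d_counts, N * sizeof(int));
  cudaMalloc(&d_num_runs, sizeof(int));
  cudaMemcpy(d_keys, h_keys, N * sizeof(int), cudaMemcpyHostToDevice);

  // Every value is 1, so summing values per key run yields the run length.
  cub::ConstantInputIterator<int> values_in(1);
  cub::Sum reduction_op;

  // Query scratch-space size, allocate, then run the keyed reduction.
  void* d_temp = nullptr;
  size_t temp_bytes = 0;
  cub::DeviceReduce::ReduceByKey(d_temp, temp_bytes, d_keys, d_unique, values_in,
                                 d_counts, d_num_runs, reduction_op, N);
  cudaMalloc(&d_temp, temp_bytes);
  cub::DeviceReduce::ReduceByKey(d_temp, temp_bytes, d_keys, d_unique, values_in,
                                 d_counts, d_num_runs, reduction_op, N);
  cudaDeviceSynchronize();

  int num_runs = 0;
  cudaMemcpy(&num_runs, d_num_runs, sizeof(int), cudaMemcpyDeviceToHost);
  int unique[N], counts[N];
  cudaMemcpy(unique, d_unique, N * sizeof(int), cudaMemcpyDeviceToHost);
  cudaMemcpy(counts, d_counts, N * sizeof(int), cudaMemcpyDeviceToHost);
  for (int i = 0; i < num_runs; ++i)
    std::printf("partition %d appears %d times\n", unique[i], counts[i]);
  return 0;
}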
@@ -467,4 +483,4 @@ TF_CALL_complex128(REGISTER_DYNAMIC_PARTITION_GPU);
 
 }  // namespace tensorflow
 
-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
