
Commit 44eb3cd

whchung authored and sunway513 committed
Merge pull request #33 from parallelo/nvfixes
CUDA target-related updates to facilitate upstreaming
1 parent 6ae3ca5 commit 44eb3cd


42 files changed, +175 −151 lines changed

tensorflow/compiler/xla/service/gpu/gpu_transfer_manager.cc

Lines changed: 7 additions & 9 deletions

@@ -22,10 +22,10 @@ limitations under the License.
 #include "llvm/IR/DataLayout.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 // XXX figure out how to cope with both platforms
-#if GOOGLE_CUDA
-#include "tensorflow/compiler/xla/service/gpu/nvptx_compiler.h"
-#elif TENSORFLOW_USE_ROCM
+#if TENSORFLOW_USE_ROCM
 #include "tensorflow/compiler/xla/service/gpu/amdgpu_compiler.h"
+#else
+#include "tensorflow/compiler/xla/service/gpu/nvptx_compiler.h"
 #endif
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/status_macros.h"
@@ -46,13 +46,11 @@ namespace xla {
 GpuTransferManager::GpuTransferManager(se::Platform::Id id)
     : GenericTransferManager(
           id,
-          // XXX figure out how to cope with both platforms
-#if GOOGLE_CUDA
-          /*pointer_size=*/llvm::DataLayout(gpu::NVPTXCompiler::kDataLayout)
-#elif TENSORFLOW_USE_ROCM
-          /*pointer_size=*/llvm::DataLayout(gpu::AMDGPUCompiler::kDataLayout)
+#if TENSORFLOW_USE_ROCM
+          llvm::DataLayout(gpu::AMDGPUCompiler::kDataLayout).getPointerSize(0)) {}
+#else
+          llvm::DataLayout(gpu::NVPTXCompiler::kDataLayout).getPointerSize(0)) {}
 #endif
-              .getPointerSize(0 /* default address space */)) {}
 
 Status GpuTransferManager::TransferLiteralToInfeed(
     se::StreamExecutor* executor, const LiteralSlice& literal) {
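Note on this hunk: the conditional is inverted so that TENSORFLOW_USE_ROCM is the explicit special case and the NVPTX path becomes the default, matching how upstream CUDA-only builds expect the file to compile. The pointer size is now computed inline from llvm::DataLayout. A minimal standalone sketch of that LLVM API, using an illustrative layout string rather than either compiler's actual kDataLayout:

    // DataLayout parses a target data-layout string; getPointerSize(0)
    // returns the pointer width in bytes for address space 0.
    #include <cstdio>
    #include "llvm/IR/DataLayout.h"

    int main() {
      // Illustrative layout string; pointers default to 64 bits when no
      // "p" spec is present.
      llvm::DataLayout dl("e-i64:64-i128:128-v16:16-v32:32-n16:32:64");
      unsigned bytes = dl.getPointerSize(0);  // 0 = default address space
      std::printf("pointer size: %u bytes\n", bytes);  // prints 8
      return 0;
    }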

tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/nvptx_backend_lib.cc

Lines changed: 1 addition & 1 deletion

@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.h"
+#include "tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/nvptx_backend_lib.h"
 
 #include <map>
 #include <memory>

tensorflow/compiler/xla/service/gpu/nvptx_executable.cc

Lines changed: 2 additions & 2 deletions

@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/compiler/xla/service/gpu/gpu_executable.h"
+#include "tensorflow/compiler/xla/service/gpu/nvptx_executable.h"
 
 #include <set>
 #include <utility>
@@ -45,7 +45,7 @@ NVPTXExecutable::NVPTXExecutable(
     std::unique_ptr<const BufferAssignment> assignment,
     std::unique_ptr<HloProfilePrinterData> hlo_profile_printer_data,
     std::unique_ptr<HloProfileIndexMap> hlo_profile_index_map)
-    : GpuExecutable(std::move(text), std::move(think_schedule),
+    : GpuExecutable(std::move(text), std::move(thunk_schedule),
                     std::move(hlo_module), std::move(assignment),
                     std::move(hlo_profile_printer_data),
                     std::move(hlo_profile_index_map)),
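Note: besides the header rename, this hunk fixes a typo in the member-initializer list — the constructor parameter is thunk_schedule, not think_schedule. A minimal sketch of the forwarding pattern, with simplified stand-in types (the real signatures take several more arguments):

    #include <memory>
    #include <string>
    #include <utility>

    struct ThunkSchedule {};  // stand-in for the real type

    // Shared base: owns the compiled text and the thunk schedule.
    class GpuExecutable {
     public:
      GpuExecutable(std::string text,
                    std::unique_ptr<ThunkSchedule> thunk_schedule)
          : text_(std::move(text)),
            thunk_schedule_(std::move(thunk_schedule)) {}

     private:
      std::string text_;
      std::unique_ptr<ThunkSchedule> thunk_schedule_;
    };

    // The platform-specific executable moves everything through to the base.
    class NVPTXExecutable : public GpuExecutable {
     public:
      NVPTXExecutable(std::string text,
                      std::unique_ptr<ThunkSchedule> thunk_schedule)
          : GpuExecutable(std::move(text), std::move(thunk_schedule)) {}
    };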

tensorflow/contrib/fused_conv/kernels/fused_conv2d_bias_activation_op.cc

Lines changed: 3 additions & 3 deletions

@@ -516,7 +516,7 @@ void LaunchFusedConv2DBiasActivationOp<GPUDevice, T, BiasType, ScaleType>::
   auto bias_ptr = AsDeviceMemory(bias.template flat<BiasType>().data(),
                                  bias.template flat<BiasType>().size());
 
-  static int64 ConvolveScratchSize = GetCudnnWorkspaceLimit(
+  static int64 ConvolveScratchSize = GetDnnWorkspaceLimit(
      // default value is in bytes despite the name of the environment variable
      "TF_CUDNN_WORKSPACE_LIMIT_IN_MB", 1LL << 32  // 4GB
      );
@@ -551,7 +551,7 @@ void LaunchFusedConv2DBiasActivationOp<GPUDevice, T, BiasType, ScaleType>::
     for (auto profile_algorithm : algorithms) {
       // TODO(zhengxq): profile each algorithm multiple times to better
       // accuracy.
-      CudnnScratchAllocator scratch_allocator(ConvolveScratchSize, ctx);
+      DnnScratchAllocator scratch_allocator(ConvolveScratchSize, ctx);
       dnn::ProfileResult profile_result;
       bool cudnn_launch_status =
           stream
@@ -591,7 +591,7 @@ void LaunchFusedConv2DBiasActivationOp<GPUDevice, T, BiasType, ScaleType>::
         algorithm_config);
   }
 
-  CudnnScratchAllocator scratch_allocator(ConvolveScratchSize, ctx);
+  DnnScratchAllocator scratch_allocator(ConvolveScratchSize, ctx);
   bool cudnn_launch_status =
       stream
           ->ThenFusedConvolveWithAlgorithm(
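Note: dropping the "Cudnn" prefix generalizes these helper names so they cover ROCm's MIOpen as well as cuDNN. A hedged sketch of what a GetDnnWorkspaceLimit-style helper does, inferred from the call site above (env var expressed in MB, default expressed in bytes); the real helper lives in TensorFlow's GPU kernel utilities:

    #include <cstdint>
    #include <cstdlib>

    int64_t GetDnnWorkspaceLimitSketch(const char* envvar_name,
                                       int64_t default_value_in_bytes) {
      const char* value = std::getenv(envvar_name);
      if (value == nullptr) return default_value_in_bytes;
      // The variable name ends in "_IN_MB", so scale megabytes to bytes.
      return std::strtoll(value, nullptr, 10) * (1LL << 20);
    }

    // Usage mirroring the diff: a 4GB default unless the user overrides it.
    // int64_t limit = GetDnnWorkspaceLimitSketch(
    //     "TF_CUDNN_WORKSPACE_LIMIT_IN_MB", 1LL << 32);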

tensorflow/contrib/image/kernels/adjust_hsv_in_yiq_op_gpu.cu.cc

Lines changed: 1 addition & 1 deletion

@@ -18,7 +18,7 @@ limitations under the License.
 #include "tensorflow/contrib/image/kernels/adjust_hsv_in_yiq_op.h"
 #include "tensorflow/core/kernels/gpu_utils.h"
 #include "tensorflow/core/platform/stream_executor.h"
-#include "tensorflow/core/util/cuda_kernel_helper.h"
+#include "tensorflow/core/util/gpu_kernel_helper.h"
 
 namespace tensorflow {
 

tensorflow/contrib/reduce_slice_ops/kernels/reduce_slice_ops_gpu.cu.cc

Lines changed: 6 additions & 6 deletions

@@ -21,7 +21,7 @@ limitations under the License.
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
-#include "tensorflow/core/util/cuda_kernel_helper.h"
+#include "tensorflow/core/util/gpu_kernel_helper.h"
 
 namespace tensorflow {
 
@@ -32,11 +32,11 @@ namespace functor {
 #define GPUReduceSliceFunctorReduceop(reduceop, beginning)                   \
   template <typename T, typename Index>                                      \
   __global__ void ReduceSliceDeviceKernel##reduceop(                         \
-      Cuda3DLaunchConfig config, Index indices_width, Index bound,           \
+      Gpu3DLaunchConfig config, Index indices_width, Index bound,            \
       const T begin, const Index *indices, const T *input, T *out) {         \
-    CUDA_AXIS_KERNEL_LOOP(x, config.virtual_thread_count.x, X) {             \
-      CUDA_AXIS_KERNEL_LOOP(y, config.virtual_thread_count.y, Y) {           \
-        CUDA_AXIS_KERNEL_LOOP(z, config.virtual_thread_count.z, Z) {         \
+    GPU_AXIS_KERNEL_LOOP(x, config.virtual_thread_count.x, X) {              \
+      GPU_AXIS_KERNEL_LOOP(y, config.virtual_thread_count.y, Y) {            \
+        GPU_AXIS_KERNEL_LOOP(z, config.virtual_thread_count.z, Z) {          \
           Index outidx = x * config.virtual_thread_count.y *                 \
                              config.virtual_thread_count.z +                 \
                          y * config.virtual_thread_count.z + z;              \
@@ -68,7 +68,7 @@ namespace functor {
     if (sizex * sizey * sizez == 0) {                                        \
       return;                                                                \
     }                                                                        \
-    Cuda3DLaunchConfig config = GetCuda3DLaunchConfig(                       \
+    Gpu3DLaunchConfig config = GetGpu3DLaunchConfig(                         \
        sizex, sizey, sizez, d, ReduceSliceDeviceKernel##reduceop<T, Index>,  \
        0, 0);                                                                \
                                                                              \
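Note: the renamed macros keep their CUDA-era semantics — a grid-stride loop along one launch axis, so a fixed-size grid covers an arbitrarily large virtual thread count. An illustrative single-axis version (not TF's exact macro, which is generated for X, Y, and Z):

    // Grid-stride loop along x: each thread starts at its global index and
    // strides by the total number of launched threads.
    #define AXIS_KERNEL_LOOP_X(i, n)                                \
      for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n);  \
           i += gridDim.x * blockDim.x)

    __global__ void ScaleKernel(int n, const float* in, float* out) {
      AXIS_KERNEL_LOOP_X(i, n) { out[i] = 2.0f * in[i]; }
    }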

tensorflow/contrib/resampler/kernels/resampler_ops_gpu.cu.cc

Lines changed: 9 additions & 9 deletions

@@ -23,7 +23,7 @@
 #include <cmath>
 
 #include "tensorflow/core/framework/register_types.h"
-#include "tensorflow/core/util/cuda_kernel_helper.h"
+#include "tensorflow/core/util/gpu_kernel_helper.h"
 
 namespace tensorflow {
 
@@ -43,7 +43,7 @@ __global__ void Resampler2DKernel(const T* __restrict__ data,
                                   const int data_channels,
                                   const int num_sampling_points) {
   const int output_data_size = batch_size * num_sampling_points * data_channels;
-  CUDA_1D_KERNEL_LOOP(index, output_data_size) {
+  GPU_1D_KERNEL_LOOP(index, output_data_size) {
     const int out_index = index;
 
     // Get (idxSample, channel, point) from the index.
@@ -117,8 +117,8 @@ struct Resampler2DFunctor<GPUDevice, T> {
                   const int data_channels, const int num_sampling_points) {
     const int output_data_size =
         batch_size * num_sampling_points * data_channels;
-    ::tensorflow::CudaLaunchConfig config =
-        ::tensorflow::GetCudaLaunchConfig(output_data_size, d);
+    ::tensorflow::GpuLaunchConfig config =
+        ::tensorflow::GetGpuLaunchConfig(output_data_size, d);
     Resampler2DKernel<T>
         <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
             data, warp, output, batch_size, data_height, data_width,
@@ -149,7 +149,7 @@ __global__ void ResamplerGrad2DKernel(
     const int num_sampling_points) {
   const int resampler_output_size =
       batch_size * num_sampling_points * data_channels;
-  CUDA_1D_KERNEL_LOOP(index, resampler_output_size) {
+  GPU_1D_KERNEL_LOOP(index, resampler_output_size) {
    const int out_index = index;
 
    // Get (idxSample, channel, point) from the index.
@@ -252,20 +252,20 @@ struct ResamplerGrad2DFunctor<GPUDevice, T> {
     const int grad_data_size =
         batch_size * data_height * data_width * data_channels;
 
-    ::tensorflow::CudaLaunchConfig config =
-        ::tensorflow::GetCudaLaunchConfig(grad_warp_size, d);
+    ::tensorflow::GpuLaunchConfig config =
+        ::tensorflow::GetGpuLaunchConfig(grad_warp_size, d);
     ::tensorflow::
         SetZero<<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
             grad_warp_size, grad_warp);
 
-    config = ::tensorflow::GetCudaLaunchConfig(grad_data_size, d);
+    config = ::tensorflow::GetGpuLaunchConfig(grad_data_size, d);
     ::tensorflow::
         SetZero<<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
             grad_data_size, grad_data);
 
     const int resampler_output_size =
         batch_size * num_sampling_points * data_channels;
-    config = ::tensorflow::GetCudaLaunchConfig(resampler_output_size, d);
+    config = ::tensorflow::GetGpuLaunchConfig(resampler_output_size, d);
     ResamplerGrad2DKernel<T>
         <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
             data, warp, grad_output, grad_data, grad_warp, batch_size,
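Note: every launch in this file follows the same two-step pattern — GetGpuLaunchConfig sizes the grid for an element count, and GPU_1D_KERNEL_LOOP strides each thread across that count inside the kernel. A hedged sketch with a stand-in macro (the real definitions are in tensorflow/core/util/gpu_kernel_helper.h):

    #define GPU_1D_KERNEL_LOOP_SKETCH(i, n)                         \
      for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n);  \
           i += gridDim.x * blockDim.x)

    __global__ void SetZeroSketch(int n, float* dst) {
      GPU_1D_KERNEL_LOOP_SKETCH(index, n) { dst[index] = 0.0f; }
    }

    // Call-site shape mirroring the diff:
    //   GpuLaunchConfig config = GetGpuLaunchConfig(grad_data_size, d);
    //   SetZeroSketch<<<config.block_count, config.thread_per_block, 0,
    //                   d.stream()>>>(grad_data_size, grad_data);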

tensorflow/contrib/rnn/kernels/lstm_ops_gpu.cu.cc

Lines changed: 3 additions & 3 deletions

@@ -22,7 +22,7 @@ limitations under the License.
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/kernels/eigen_activations.h"
 #include "tensorflow/core/platform/logging.h"
-#include "tensorflow/core/util/cuda_kernel_helper.h"
+#include "tensorflow/core/util/gpu_kernel_helper.h"
 
 namespace tensorflow {
 namespace functor {
@@ -186,7 +186,7 @@ void LSTMBlockCellFpropWithCUDA(
     typename TTypes<T>::Matrix co, typename TTypes<T>::Matrix icfo,
     typename TTypes<T>::Matrix h, int batch_size, int cell_size,
     int input_size) {
-  const cudaStream_t& cu_stream = GetCudaStream(ctx);
+  const cudaStream_t& cu_stream = GetGpuStream(ctx);
 
   // Concatenate xh = [x, h].
   //
@@ -321,7 +321,7 @@ void LSTMBlockCellBpropWithCUDA(
     typename TTypes<T>::Vec wci_grad, typename TTypes<T>::Vec wcf_grad,
     typename TTypes<T>::Vec wco_grad, const int batch_size, const int cell_size,
     const bool use_peephole) {
-  const cudaStream_t& cu_stream = GetCudaStream(ctx);
+  const cudaStream_t& cu_stream = GetGpuStream(ctx);
 
   dim3 block_dim_2d(std::min(batch_size, 8), 32);
   dim3 grid_dim_2d(Eigen::divup(batch_size, static_cast<int>(block_dim_2d.x)),
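Note: GetGpuStream replaces GetCudaStream as the helper that extracts the raw device stream from the op's context; the variable here stays typed cudaStream_t, which suits the CUDA path this PR targets. The raw stream matters because hand-written launches must be ordered with the rest of the op's queued work, as in this hedged .cu sketch (hypothetical kernel; in the real code the stream comes from GetGpuStream(ctx)):

    __global__ void MyKernel(int n, float* data);

    // Launch on the op's own stream so the kernel is serialized with the
    // op's other device work rather than running on the default stream.
    void LaunchOnOpStream(const cudaStream_t& cu_stream, int n, float* data) {
      dim3 block(256);
      dim3 grid((n + block.x - 1) / block.x);
      MyKernel<<<grid, block, 0, cu_stream>>>(n, data);
    }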

tensorflow/contrib/seq2seq/kernels/beam_search_ops_gpu.cu.cc

Lines changed: 3 additions & 3 deletions

@@ -18,7 +18,7 @@ limitations under the License.
 #define EIGEN_USE_GPU
 
 #include "tensorflow/contrib/seq2seq/kernels/beam_search_ops.h"
-#include "tensorflow/core/util/cuda_kernel_helper.h"
+#include "tensorflow/core/util/gpu_kernel_helper.h"
 
 namespace tensorflow {
 namespace functor {
@@ -31,7 +31,7 @@ __global__ void GatherTreeOpKernel(const int32 batch_size, const int32 max_time,
                                    const T* parent_ids,
                                    const int32* max_sequence_lengths,
                                    const T end_token, T* beams) {
-  CUDA_1D_KERNEL_LOOP(i, batch_size * beam_width) {
+  GPU_1D_KERNEL_LOOP(i, batch_size * beam_width) {
     const int32 batch = i / beam_width;
     const int32 beam = i % beam_width;
 
@@ -90,7 +90,7 @@ struct GatherTree<GPUDevice, T> {
     // First kernel launch to "zero" things out
     beams.device(d) = beams.constant(end_token);
 
-    CudaLaunchConfig config = GetCudaLaunchConfig(batch_size * beam_width, d);
+    GpuLaunchConfig config = GetGpuLaunchConfig(batch_size * beam_width, d);
     // clang-format off
     GatherTreeOpKernel<T>
         <<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
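Note: a simplified sketch of what a GetGpuLaunchConfig-style helper computes from an element count (the real TF version also consults the device's occupancy limits before settling on a block count):

    struct GpuLaunchConfigSketch {
      int virtual_thread_count;  // total logical work items
      int thread_per_block;      // threads per block
      int block_count;           // blocks in the grid
    };

    GpuLaunchConfigSketch GetGpuLaunchConfigSketch(int work_element_count) {
      GpuLaunchConfigSketch config;
      config.virtual_thread_count = work_element_count;
      config.thread_per_block = 256;  // a common default block size
      config.block_count =
          (work_element_count + config.thread_per_block - 1) /
          config.thread_per_block;
      return config;
    }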

tensorflow/core/kernels/adjust_hue_op_gpu.cu.cc

Lines changed: 1 addition & 1 deletion

@@ -33,7 +33,7 @@ void AdjustHueGPU::operator()(GPUDevice* device, const int64 number_of_elements,
   const int threads_per_block = config.thread_per_block;
   const int block_count =
       (number_of_elements + threads_per_block - 1) / threads_per_block;
-  GPU_LAUNCH_KERNEL(internal::adjust_hsv_nhwc<true, false, false>,
+  GPU_LAUNCH_KERNEL((internal::adjust_hsv_nhwc<true, false, false>),
                     dim3(block_count), dim3(threads_per_block), 0, stream,
                     number_of_elements, input, output, delta, nullptr, nullptr);
 }
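Note: the only change here is the extra parentheses around the kernel name. The C preprocessor splits macro arguments on top-level commas, so the commas inside adjust_hsv_nhwc<true, false, false> would otherwise be parsed as additional macro arguments. A minimal demonstration with a hypothetical three-argument launch macro (the principle is the same for GPU_LAUNCH_KERNEL):

    #define LAUNCH3(kernel, grid, block) (kernel)<<<(grid), (block)>>>()

    template <bool A, bool B>
    __global__ void k() {}

    void demo() {
      // LAUNCH3(k<true, false>, 1, 1);  // error: macro sees four arguments
      LAUNCH3((k<true, false>), 1, 1);   // OK: parentheses hide the comma
    }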
