From 8d9a769fc23143be9baf933b590eeb18696c2a67 Mon Sep 17 00:00:00 2001 From: Zachary Streeter <90640993+zstreet87@users.noreply.github.com> Date: Thu, 14 Mar 2024 17:27:50 -0500 Subject: [PATCH] Merge pull request #2450 from ROCm/navi_bug fixed navi bugs --- tensorflow/compiler/mlir/runlit.cfg.py | 2 +- tensorflow/core/kernels/reduction_gpu_kernels.cu.h | 12 ++++++------ tensorflow/core/util/gpu_kernel_helper_test.cu.cc | 6 +++--- .../python/grappler/auto_mixed_precision_test.py | 2 +- 4 files changed, 11 insertions(+), 11 deletions(-) diff --git a/tensorflow/compiler/mlir/runlit.cfg.py b/tensorflow/compiler/mlir/runlit.cfg.py index b8ab4b65fb6e02..bdc8e3b21a6fca 100644 --- a/tensorflow/compiler/mlir/runlit.cfg.py +++ b/tensorflow/compiler/mlir/runlit.cfg.py @@ -63,7 +63,7 @@ # Tweak the PATH to include the tools dir. llvm_config.with_environment('PATH', config.llvm_tools_dir, append_path=True) -for key in ['HIP_VISIBLE_DEVICES', 'CUDA_VISIBLE_DEVICES', +for key in ['ROCM_PATH', 'HIP_VISIBLE_DEVICES', 'CUDA_VISIBLE_DEVICES', 'TF_PER_DEVICE_MEMORY_LIMIT_MB']: value = os.environ.get(key, None) if value != None: diff --git a/tensorflow/core/kernels/reduction_gpu_kernels.cu.h b/tensorflow/core/kernels/reduction_gpu_kernels.cu.h index 544c854d572f5a..f0be7d64a9f170 100644 --- a/tensorflow/core/kernels/reduction_gpu_kernels.cu.h +++ b/tensorflow/core/kernels/reduction_gpu_kernels.cu.h @@ -718,14 +718,14 @@ struct GatherOp { }; #if TENSORFLOW_USE_ROCM -inline bool isGfx10(OpKernelContext* ctx) { +inline bool isGfx10orGfx11(OpKernelContext* ctx) { hipDeviceProp_t props; int dev = 0; hipError_t result = hipGetDevice(&dev); result = hipGetDeviceProperties(&props, dev); if (result == hipSuccess) { std::string gcnArchName = props.gcnArchName; - return (gcnArchName.substr(0,5)=="gfx10"); + return (gcnArchName.substr(0,5)=="gfx10" || gcnArchName.substr(0,5)=="gfx11"); } return false; } @@ -736,7 +736,7 @@ void LaunchScalarReduction(OpKernelContext* ctx, OUT_T out, IN_T in, int 
in_size, Op op, T init, const gpuStream_t& cu_stream) { #if TENSORFLOW_USE_ROCM - int WARPSIZE = isGfx10(ctx) ? 32 : 64; + int WARPSIZE = isGfx10orGfx11(ctx) ? 32 : 64; #else constexpr int WARPSIZE = TF_RED_WARPSIZE; #endif @@ -808,7 +808,7 @@ void LaunchRowReduction(OpKernelContext* ctx, OUT_T out, IN_T in, int num_rows, int num_cols, Op op, T init, const gpuStream_t& cu_stream) { #if TENSORFLOW_USE_ROCM - int WARPSIZE = isGfx10(ctx) ? 32 : 64; + int WARPSIZE = isGfx10orGfx11(ctx) ? 32 : 64; #else constexpr int WARPSIZE = TF_RED_WARPSIZE; #endif @@ -858,7 +858,7 @@ void LaunchColumnReduction_LTE16Cols(OpKernelContext* ctx, OUT_T out, IN_T in, int extent_x, int extent_y, Op op, T init, const gpuStream_t& cu_stream) { #if TENSORFLOW_USE_ROCM - int WARPSIZE = (std::is_same<T, std::complex<double>>::value || isGfx10(ctx)) + int WARPSIZE = (std::is_same<T, std::complex<double>>::value || isGfx10orGfx11(ctx)) ? 32 : 64; #else constexpr int WARPSIZE = TF_RED_WARPSIZE; #endif @@ -919,7 +919,7 @@ void LaunchColumnReduction_LTE4096Cols(OpKernelContext* ctx, OUT_T out, IN_T in, // On ROCm, TF_RED_WARPSIZE is 64 and the default value would require // 66 kB of shared memory with double complex - more than actually // available in the GPU. - int WARPSIZE = (std::is_same<T, std::complex<double>>::value || isGfx10(ctx)) + int WARPSIZE = (std::is_same<T, std::complex<double>>::value || isGfx10orGfx11(ctx)) ?
32 : 64; #else constexpr int WARPSIZE = TF_RED_WARPSIZE; diff --git a/tensorflow/core/util/gpu_kernel_helper_test.cu.cc b/tensorflow/core/util/gpu_kernel_helper_test.cu.cc index ad0c2dee2c9dff..6612fa5c225ddb 100644 --- a/tensorflow/core/util/gpu_kernel_helper_test.cu.cc +++ b/tensorflow/core/util/gpu_kernel_helper_test.cu.cc @@ -313,14 +313,14 @@ TEST_F(GpuLaunchConfigTest, GetGpu3DLaunchConfig) { } #if TENSORFLOW_USE_ROCM -inline bool isGfx10() { +inline bool isGfx10orGfx11() { hipDeviceProp_t props; int dev = 0; hipError_t result = hipGetDevice(&dev); result = hipGetDeviceProperties(&props, dev); if (result == hipSuccess) { std::string gcnArchName = props.gcnArchName; - return (gcnArchName.substr(0,5)=="gfx10"); + return (gcnArchName.substr(0,5)=="gfx10" || gcnArchName.substr(0,5)=="gfx11"); } return false; } @@ -335,7 +335,7 @@ TEST(CudaDeviceFunctionsTest, ShuffleGetSrcLane) { #endif *failure_count = 0; #if TENSORFLOW_USE_ROCM - const int TF_RED_WARPSIZE=isGfx10() ? 32 : 64; + const int TF_RED_WARPSIZE=isGfx10orGfx11() ? 32 : 64; #endif TF_EXPECT_OK(GpuLaunchKernel(GpuShuffleGetSrcLaneTest, 1, TF_RED_WARPSIZE, 0, nullptr, failure_count)); diff --git a/tensorflow/python/grappler/auto_mixed_precision_test.py b/tensorflow/python/grappler/auto_mixed_precision_test.py index b5446bd866c335..419fd2c4d6d0d7 100644 --- a/tensorflow/python/grappler/auto_mixed_precision_test.py +++ b/tensorflow/python/grappler/auto_mixed_precision_test.py @@ -583,7 +583,7 @@ def test_conv_pool(self, mode): self._assert_output_f16(mode, node_map, 'Conv2D_1') self.assertEqual(num_to_f16, 4) self.assertEqual(num_to_fp32, 1) - tol = 5e-3 if mode == 'mkl' else 1e-3 + tol = 5e-3 if mode == 'mkl' else 1e-2 self.assertAllClose(output_val_ref, output_val, atol=tol, rtol=tol) # TODO(benbarsdell): This test has not been tried with MKL.