From 8d9a769fc23143be9baf933b590eeb18696c2a67 Mon Sep 17 00:00:00 2001 From: Zachary Streeter <90640993+zstreet87@users.noreply.github.com> Date: Thu, 14 Mar 2024 17:27:50 -0500 Subject: [PATCH] Merge pull request #2450 from ROCm/navi_bug fixed navi bugs --- tensorflow/compiler/mlir/runlit.cfg.py | 2 +- tensorflow/core/kernels/reduction_gpu_kernels.cu.h | 12 ++++++------ tensorflow/core/util/gpu_kernel_helper_test.cu.cc | 6 +++--- .../python/grappler/auto_mixed_precision_test.py | 2 +- 4 files changed, 11 insertions(+), 11 deletions(-) diff --git a/tensorflow/compiler/mlir/runlit.cfg.py b/tensorflow/compiler/mlir/runlit.cfg.py index b8ab4b65fb6e02..bdc8e3b21a6fca 100644 --- a/tensorflow/compiler/mlir/runlit.cfg.py +++ b/tensorflow/compiler/mlir/runlit.cfg.py @@ -63,7 +63,7 @@ # Tweak the PATH to include the tools dir. llvm_config.with_environment('PATH', config.llvm_tools_dir, append_path=True) -for key in ['HIP_VISIBLE_DEVICES', 'CUDA_VISIBLE_DEVICES', +for key in ['ROCM_PATH', 'HIP_VISIBLE_DEVICES', 'CUDA_VISIBLE_DEVICES', 'TF_PER_DEVICE_MEMORY_LIMIT_MB']: value = os.environ.get(key, None) if value != None: diff --git a/tensorflow/core/kernels/reduction_gpu_kernels.cu.h b/tensorflow/core/kernels/reduction_gpu_kernels.cu.h index 544c854d572f5a..f0be7d64a9f170 100644 --- a/tensorflow/core/kernels/reduction_gpu_kernels.cu.h +++ b/tensorflow/core/kernels/reduction_gpu_kernels.cu.h @@ -718,14 +718,14 @@ struct GatherOp { }; #if TENSORFLOW_USE_ROCM -inline bool isGfx10(OpKernelContext* ctx) { +inline bool isGfx10orGfx11(OpKernelContext* ctx) { hipDeviceProp_t props; int dev = 0; hipError_t result = hipGetDevice(&dev); result = hipGetDeviceProperties(&props, dev); if (result == hipSuccess) { std::string gcnArchName = props.gcnArchName; - return (gcnArchName.substr(0,5)=="gfx10"); + return (gcnArchName.substr(0,5)=="gfx10" || gcnArchName.substr(0,5)=="gfx11"); } return false; } @@ -736,7 +736,7 @@ void LaunchScalarReduction(OpKernelContext* ctx, OUT_T out, IN_T in, int 
in_size, Op op, T init, const gpuStream_t& cu_stream) { #if TENSORFLOW_USE_ROCM - int WARPSIZE = isGfx10(ctx) ? 32 : 64; + int WARPSIZE = isGfx10orGfx11(ctx) ? 32 : 64; #else constexpr int WARPSIZE = TF_RED_WARPSIZE; #endif @@ -808,7 +808,7 @@ void LaunchRowReduction(OpKernelContext* ctx, OUT_T out, IN_T in, int num_rows, int num_cols, Op op, T init, const gpuStream_t& cu_stream) { #if TENSORFLOW_USE_ROCM - int WARPSIZE = isGfx10(ctx) ? 32 : 64; + int WARPSIZE = isGfx10orGfx11(ctx) ? 32 : 64; #else constexpr int WARPSIZE = TF_RED_WARPSIZE; #endif @@ -858,7 +858,7 @@ void LaunchColumnReduction_LTE16Cols(OpKernelContext* ctx, OUT_T out, IN_T in, int extent_x, int extent_y, Op op, T init, const gpuStream_t& cu_stream) { #if TENSORFLOW_USE_ROCM - int WARPSIZE = (std::is_same<T, std::complex<double>>::value || isGfx10(ctx)) + int WARPSIZE = (std::is_same<T, std::complex<double>>::value || isGfx10orGfx11(ctx)) ? 32 : 64; #else constexpr int WARPSIZE = TF_RED_WARPSIZE; #endif @@ -919,7 +919,7 @@ void LaunchColumnReduction_LTE4096Cols(OpKernelContext* ctx, OUT_T out, IN_T in, // On ROCm, TF_RED_WARPSIZE is 64 and the default value would require // 66 kB of shared memory with double complex - more than actually // available in the GPU. - int WARPSIZE = (std::is_same<T, std::complex<double>>::value || isGfx10(ctx)) + int WARPSIZE = (std::is_same<T, std::complex<double>>::value || isGfx10orGfx11(ctx)) ?
32 : 64; #else constexpr int WARPSIZE = TF_RED_WARPSIZE; diff --git a/tensorflow/core/util/gpu_kernel_helper_test.cu.cc b/tensorflow/core/util/gpu_kernel_helper_test.cu.cc index ad0c2dee2c9dff..6612fa5c225ddb 100644 --- a/tensorflow/core/util/gpu_kernel_helper_test.cu.cc +++ b/tensorflow/core/util/gpu_kernel_helper_test.cu.cc @@ -313,14 +313,14 @@ TEST_F(GpuLaunchConfigTest, GetGpu3DLaunchConfig) { } #if TENSORFLOW_USE_ROCM -inline bool isGfx10() { +inline bool isGfx10orGfx11() { hipDeviceProp_t props; int dev = 0; hipError_t result = hipGetDevice(&dev); result = hipGetDeviceProperties(&props, dev); if (result == hipSuccess) { std::string gcnArchName = props.gcnArchName; - return (gcnArchName.substr(0,5)=="gfx10"); + return (gcnArchName.substr(0,5)=="gfx10" || gcnArchName.substr(0,5)=="gfx11"); } return false; } @@ -335,7 +335,7 @@ TEST(CudaDeviceFunctionsTest, ShuffleGetSrcLane) { #endif *failure_count = 0; #if TENSORFLOW_USE_ROCM - const int TF_RED_WARPSIZE=isGfx10() ? 32 : 64; + const int TF_RED_WARPSIZE=isGfx10orGfx11() ? 32 : 64; #endif TF_EXPECT_OK(GpuLaunchKernel(GpuShuffleGetSrcLaneTest, 1, TF_RED_WARPSIZE, 0, nullptr, failure_count)); diff --git a/tensorflow/python/grappler/auto_mixed_precision_test.py b/tensorflow/python/grappler/auto_mixed_precision_test.py index b5446bd866c335..419fd2c4d6d0d7 100644 --- a/tensorflow/python/grappler/auto_mixed_precision_test.py +++ b/tensorflow/python/grappler/auto_mixed_precision_test.py @@ -583,7 +583,7 @@ def test_conv_pool(self, mode): self._assert_output_f16(mode, node_map, 'Conv2D_1') self.assertEqual(num_to_f16, 4) self.assertEqual(num_to_fp32, 1) - tol = 5e-3 if mode == 'mkl' else 1e-3 + tol = 5e-3 if mode == 'mkl' else 1e-2 self.assertAllClose(output_val_ref, output_val, atol=tol, rtol=tol) # TODO(benbarsdell): This test has not been tried with MKL.