Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[r2.14-rocm-enhanced] Fix Navi bugs #2462

Merged
merged 1 commit into from
Mar 18, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion tensorflow/compiler/mlir/runlit.cfg.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@
# Tweak the PATH to include the tools dir.
llvm_config.with_environment('PATH', config.llvm_tools_dir, append_path=True)

for key in ['HIP_VISIBLE_DEVICES', 'CUDA_VISIBLE_DEVICES',
for key in ['ROCM_PATH', 'HIP_VISIBLE_DEVICES', 'CUDA_VISIBLE_DEVICES',
'TF_PER_DEVICE_MEMORY_LIMIT_MB']:
value = os.environ.get(key, None)
if value != None:
Expand Down
12 changes: 6 additions & 6 deletions tensorflow/core/kernels/reduction_gpu_kernels.cu.h
Original file line number Diff line number Diff line change
Expand Up @@ -718,14 +718,14 @@ struct GatherOp {
};

#if TENSORFLOW_USE_ROCM
inline bool isGfx10(OpKernelContext* ctx) {
inline bool isGfx10orGfx11(OpKernelContext* ctx) {
hipDeviceProp_t props;
int dev = 0;
hipError_t result = hipGetDevice(&dev);
result = hipGetDeviceProperties(&props, dev);
if (result == hipSuccess) {
std::string gcnArchName = props.gcnArchName;
return (gcnArchName.substr(0,5)=="gfx10");
return (gcnArchName.substr(0,5)=="gfx10" || gcnArchName.substr(0,5)=="gfx11");
}
return false;
}
Expand All @@ -736,7 +736,7 @@ void LaunchScalarReduction(OpKernelContext* ctx, OUT_T out, IN_T in,
int in_size, Op op, T init,
const gpuStream_t& cu_stream) {
#if TENSORFLOW_USE_ROCM
int WARPSIZE = isGfx10(ctx) ? 32 : 64;
int WARPSIZE = isGfx10orGfx11(ctx) ? 32 : 64;
#else
constexpr int WARPSIZE = TF_RED_WARPSIZE;
#endif
Expand Down Expand Up @@ -808,7 +808,7 @@ void LaunchRowReduction(OpKernelContext* ctx, OUT_T out, IN_T in, int num_rows,
int num_cols, Op op, T init,
const gpuStream_t& cu_stream) {
#if TENSORFLOW_USE_ROCM
int WARPSIZE = isGfx10(ctx) ? 32 : 64;
int WARPSIZE = isGfx10orGfx11(ctx) ? 32 : 64;
#else
constexpr int WARPSIZE = TF_RED_WARPSIZE;
#endif
Expand Down Expand Up @@ -858,7 +858,7 @@ void LaunchColumnReduction_LTE16Cols(OpKernelContext* ctx, OUT_T out, IN_T in,
int extent_x, int extent_y, Op op, T init,
const gpuStream_t& cu_stream) {
#if TENSORFLOW_USE_ROCM
int WARPSIZE = (std::is_same<T, hipDoubleComplex>::value || isGfx10(ctx))
int WARPSIZE = (std::is_same<T, hipDoubleComplex>::value || isGfx10orGfx11(ctx))
? 32 : 64;
#else
constexpr int WARPSIZE = TF_RED_WARPSIZE;
Expand Down Expand Up @@ -919,7 +919,7 @@ void LaunchColumnReduction_LTE4096Cols(OpKernelContext* ctx, OUT_T out, IN_T in,
// On ROCm, TF_RED_WARPSIZE is 64 and the default value would require
// 66 kB of shared memory with double complex - more than actually
// available in the GPU.
int WARPSIZE = (std::is_same<T, hipDoubleComplex>::value || isGfx10(ctx))
int WARPSIZE = (std::is_same<T, hipDoubleComplex>::value || isGfx10orGfx11(ctx))
? 32 : 64;
#else
constexpr int WARPSIZE = TF_RED_WARPSIZE;
Expand Down
6 changes: 3 additions & 3 deletions tensorflow/core/util/gpu_kernel_helper_test.cu.cc
Original file line number Diff line number Diff line change
Expand Up @@ -313,14 +313,14 @@ TEST_F(GpuLaunchConfigTest, GetGpu3DLaunchConfig) {
}

#if TENSORFLOW_USE_ROCM
inline bool isGfx10() {
inline bool isGfx10orGfx11() {
hipDeviceProp_t props;
int dev = 0;
hipError_t result = hipGetDevice(&dev);
result = hipGetDeviceProperties(&props, dev);
if (result == hipSuccess) {
std::string gcnArchName = props.gcnArchName;
return (gcnArchName.substr(0,5)=="gfx10");
return (gcnArchName.substr(0,5)=="gfx10" || gcnArchName.substr(0,5)=="gfx11");
}
return false;
}
Expand All @@ -335,7 +335,7 @@ TEST(CudaDeviceFunctionsTest, ShuffleGetSrcLane) {
#endif
*failure_count = 0;
#if TENSORFLOW_USE_ROCM
const int TF_RED_WARPSIZE=isGfx10() ? 32 : 64;
const int TF_RED_WARPSIZE=isGfx10orGfx11() ? 32 : 64;
#endif
TF_EXPECT_OK(GpuLaunchKernel(GpuShuffleGetSrcLaneTest, 1, TF_RED_WARPSIZE, 0,
nullptr, failure_count));
Expand Down
2 changes: 1 addition & 1 deletion tensorflow/python/grappler/auto_mixed_precision_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -583,7 +583,7 @@ def test_conv_pool(self, mode):
self._assert_output_f16(mode, node_map, 'Conv2D_1')
self.assertEqual(num_to_f16, 4)
self.assertEqual(num_to_fp32, 1)
tol = 5e-3 if mode == 'mkl' else 1e-3
tol = 5e-3 if mode == 'mkl' else 1e-2
self.assertAllClose(output_val_ref, output_val, atol=tol, rtol=tol)

# TODO(benbarsdell): This test has not been tried with MKL.
Expand Down
Loading