From bc7f9675f1e342b8ed1dcd24e7bd6a028f344aec Mon Sep 17 00:00:00 2001 From: Nicolas Miller Date: Fri, 10 Dec 2021 17:57:31 +0000 Subject: [PATCH 1/6] [SYCL][CUDA] Add -fcuda-prec-sqrt flag This patch add `__nvvm_reflect` support for `__CUDA_PREC_SQRT` and adds a `-Xclang -fcuda-prec-sqrt` flag which is equivalent to the `nvcc` `-prec-sqrt` flag, except that it defaults to `false` for `clang++` and to `true` for `nvcc`. The reason for that is that the SYCL specification doesn't require a correctly rounded `sqrt` so we likely want to keep the fast `sqrt` as a default and use the flag when higher precision is required. See additional discussion on #4041 and #5116 --- clang/include/clang/Basic/TargetOptions.h | 3 +++ clang/include/clang/Driver/Options.td | 5 +++++ clang/lib/CodeGen/CodeGenModule.cpp | 4 +++- llvm/lib/Target/NVPTX/NVVMReflect.cpp | 5 +++++ sycl/doc/GetStartedGuide.md | 6 ++++++ 5 files changed, 22 insertions(+), 1 deletion(-) diff --git a/clang/include/clang/Basic/TargetOptions.h b/clang/include/clang/Basic/TargetOptions.h index 81c15adb8248e..07542feb9d814 100644 --- a/clang/include/clang/Basic/TargetOptions.h +++ b/clang/include/clang/Basic/TargetOptions.h @@ -75,6 +75,9 @@ class TargetOptions { /// address space. bool NVPTXUseShortPointers = false; + /// \brief If enabled, use precise square root + bool NVVMCudaPrecSqrt = false; + /// \brief If enabled, allow AMDGPU unsafe floating point atomics. bool AllowAMDGPUUnsafeFPAtomics = false; diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index d6c3ca7d9c4ea..8d9b18cb74c29 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -980,6 +980,11 @@ defm cuda_short_ptr : BoolFOption<"cuda-short-ptr", TargetOpts<"NVPTXUseShortPointers">, DefaultFalse, PosFlag, NegFlag>; +defm cuda_prec_sqrt : BoolFOption<"cuda-prec-sqrt", + TargetOpts<"NVVMCudaPrecSqrt">, DefaultFalse, + PosFlag, + NegFlag, + BothFlags<[], " that sqrt is correctly rounded (for CUDA devices)">>; def rocm_path_EQ : Joined<["--"], "rocm-path=">, Group, HelpText<"ROCm installation path, used for finding and automatically linking required bitcode libraries.">; def hip_path_EQ : Joined<["--"], "hip-path=">, Group, diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp index c55f0457bc504..4d1209c69506d 100644 --- a/clang/lib/CodeGen/CodeGenModule.cpp +++ b/clang/lib/CodeGen/CodeGenModule.cpp @@ -758,13 +758,15 @@ void CodeGenModule::Release() { llvm::MDString::get(Ctx, CodeGenOpts.MemoryProfileOutput)); } - if (LangOpts.CUDAIsDevice && getTriple().isNVPTX()) { + if ((LangOpts.CUDAIsDevice || LangOpts.isSYCL()) && getTriple().isNVPTX()) { // Indicate whether __nvvm_reflect should be configured to flush denormal // floating point values to 0. (This corresponds to its "__CUDA_FTZ" // property.) getModule().addModuleFlag(llvm::Module::Override, "nvvm-reflect-ftz", CodeGenOpts.FP32DenormalMode.Output != llvm::DenormalMode::IEEE); + getModule().addModuleFlag(llvm::Module::Override, "nvvm-reflect-prec-sqrt", + getTarget().getTargetOpts().NVVMCudaPrecSqrt); } if (LangOpts.EHAsynch) diff --git a/llvm/lib/Target/NVPTX/NVVMReflect.cpp b/llvm/lib/Target/NVPTX/NVVMReflect.cpp index 339f51d210874..2185cbce833c0 100644 --- a/llvm/lib/Target/NVPTX/NVVMReflect.cpp +++ b/llvm/lib/Target/NVPTX/NVVMReflect.cpp @@ -170,6 +170,11 @@ static bool runNVVMReflect(Function &F, unsigned SmVersion) { ReflectVal = Flag->getSExtValue(); } else if (ReflectArg == "__CUDA_ARCH") { ReflectVal = SmVersion * 10; + } else if (ReflectArg == "__CUDA_PREC_SQRT") { + // Try to pull __CUDA_PREC_SQRT from the nvvm-reflect-prec-sqrt module flag. + if (auto *Flag = mdconst::extract_or_null( + F.getParent()->getModuleFlag("nvvm-reflect-prec-sqrt"))) + ReflectVal = Flag->getSExtValue(); } Call->replaceAllUsesWith(ConstantInt::get(Call->getType(), ReflectVal)); ToRemove.push_back(Call); diff --git a/sycl/doc/GetStartedGuide.md b/sycl/doc/GetStartedGuide.md index 2b1c682644db3..b112a6e6ac500 100644 --- a/sycl/doc/GetStartedGuide.md +++ b/sycl/doc/GetStartedGuide.md @@ -819,6 +819,12 @@ which contains all the symbols required. GPU (SM 71), but it should work on any GPU compatible with SM 50 or above * The NVIDIA OpenCL headers conflict with the OpenCL headers required for this project and may cause compilation issues on some platforms +* `sycl::sqrt` is not correctly rounded by default as the SYCL specification + allows lower precision, when porting from CUDA it may be helpful to use + `-Xclang -fcuda-prec-sqrt` to use the correctly rounded square root, this is + significantly slower but matches the default precision used by `nvcc`, and + this `clang++` flag is equivalent to the `nvcc` `-prec-sqrt` flag, except that + it defaults to `false`. ### HIP back-end limitations From ade25554647d5a2053cbdbd863697674f092620c Mon Sep 17 00:00:00 2001 From: Nicolas Miller Date: Tue, 14 Dec 2021 18:07:19 +0000 Subject: [PATCH 2/6] [SYCL][CUDA] Fix formatting --- llvm/lib/Target/NVPTX/NVVMReflect.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Target/NVPTX/NVVMReflect.cpp b/llvm/lib/Target/NVPTX/NVVMReflect.cpp index 2185cbce833c0..6ffc49a59a551 100644 --- a/llvm/lib/Target/NVPTX/NVVMReflect.cpp +++ b/llvm/lib/Target/NVPTX/NVVMReflect.cpp @@ -171,7 +171,8 @@ static bool runNVVMReflect(Function &F, unsigned SmVersion) { } else if (ReflectArg == "__CUDA_ARCH") { ReflectVal = SmVersion * 10; } else if (ReflectArg == "__CUDA_PREC_SQRT") { - // Try to pull __CUDA_PREC_SQRT from the nvvm-reflect-prec-sqrt module flag. + // Try to pull __CUDA_PREC_SQRT from the nvvm-reflect-prec-sqrt module + // flag. if (auto *Flag = mdconst::extract_or_null( F.getParent()->getModuleFlag("nvvm-reflect-prec-sqrt"))) ReflectVal = Flag->getSExtValue(); From 2f6ade7a309063f7796ab76eb3c12131a5ac88a6 Mon Sep 17 00:00:00 2001 From: Nicolas Miller Date: Wed, 15 Dec 2021 18:05:15 +0000 Subject: [PATCH 3/6] [SYCL][CUDA] Add nvvm-reflect test for prec-sqrt --- llvm/test/CodeGen/NVPTX/nvvm-reflect-module-flag.ll | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/llvm/test/CodeGen/NVPTX/nvvm-reflect-module-flag.ll b/llvm/test/CodeGen/NVPTX/nvvm-reflect-module-flag.ll index 57ab33798709a..696fd6d983549 100644 --- a/llvm/test/CodeGen/NVPTX/nvvm-reflect-module-flag.ll +++ b/llvm/test/CodeGen/NVPTX/nvvm-reflect-module-flag.ll @@ -3,6 +3,7 @@ declare i32 @__nvvm_reflect(i8*) @str = private unnamed_addr addrspace(1) constant [11 x i8] c"__CUDA_FTZ\00" +@str.1 = private unnamed_addr addrspace(1) constant [17 x i8] c"__CUDA_PREC_SQRT\00" define i32 @foo() { %call = call i32 @__nvvm_reflect(i8* addrspacecast (i8 addrspace(1)* getelementptr inbounds ([11 x i8], [11 x i8] addrspace(1)* @str, i32 0, i32 0) to i8*)) @@ -10,5 +11,12 @@ define i32 @foo() { ret i32 %call } -!llvm.module.flags = !{!0} +define i32 @foo_sqrt() { + %call = call i32 @__nvvm_reflect(i8* addrspacecast (i8 addrspace(1)* getelementptr inbounds ([17 x i8], [17 x i8] addrspace(1)* @str.1, i32 0, i32 0) to i8*)) + ; CHECK: ret i32 42 + ret i32 %call +} + +!llvm.module.flags = !{!0, !1} !0 = !{i32 4, !"nvvm-reflect-ftz", i32 42} +!1 = !{i32 4, !"nvvm-reflect-prec-sqrt", i32 42} From 50372deacb774b2da15c76082e77382270d926e1 Mon Sep 17 00:00:00 2001 From: Nicolas Miller Date: Thu, 16 Dec 2021 10:34:23 +0000 Subject: [PATCH 4/6] [SYCL][CUDA] Fix nvvm ftz test --- clang/test/CodeGenCUDA/flush-denormals.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/clang/test/CodeGenCUDA/flush-denormals.cu b/clang/test/CodeGenCUDA/flush-denormals.cu index b5abc29dea14b..f19132b0b4643 100644 --- a/clang/test/CodeGenCUDA/flush-denormals.cu +++ b/clang/test/CodeGenCUDA/flush-denormals.cu @@ -44,8 +44,8 @@ extern "C" __device__ void foo() {} // FTZ: attributes #0 = {{.*}} "denormal-fp-math-f32"="preserve-sign,preserve-sign" // NOFTZ-NOT: "denormal-fp-math-f32" -// PTXFTZ:!llvm.module.flags = !{{{.*}}[[MODFLAG:![0-9]+]]} +// PTXFTZ:!llvm.module.flags = !{{{.*}}, [[MODFLAG:![0-9]+]], {{.*}}} // PTXFTZ:[[MODFLAG]] = !{i32 4, !"nvvm-reflect-ftz", i32 1} -// PTXNOFTZ:!llvm.module.flags = !{{{.*}}[[MODFLAG:![0-9]+]]} +// PTXNOFTZ:!llvm.module.flags = !{{{.*}}, [[MODFLAG:![0-9]+]], {{.*}}} // PTXNOFTZ:[[MODFLAG]] = !{i32 4, !"nvvm-reflect-ftz", i32 0} From 0ec92eca912d73ec98e8d3e6b6f0c6ea651f5f5d Mon Sep 17 00:00:00 2001 From: Nicolas Miller Date: Thu, 16 Dec 2021 13:44:35 +0000 Subject: [PATCH 5/6] [SYCL][CUDA] Update NVPTX docs with new reflect flag --- llvm/docs/NVPTXUsage.rst | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/llvm/docs/NVPTXUsage.rst b/llvm/docs/NVPTXUsage.rst index e4b5ace6b6340..0c4fe7924a0e7 100644 --- a/llvm/docs/NVPTXUsage.rst +++ b/llvm/docs/NVPTXUsage.rst @@ -343,19 +343,22 @@ Reflection Parameters The libdevice library currently uses the following reflection parameters to control code generation: -==================== ====================================================== -Flag Description -==================== ====================================================== -``__CUDA_FTZ=[0,1]`` Use optimized code paths that flush subnormals to zero -==================== ====================================================== +=========================== ====================================================== +Flag Description +=========================== ====================================================== +``__CUDA_FTZ=[0,1]`` Use optimized code paths that flush subnormals to zero +``__CUDA_PREC_SQRT=[0,1]`` Use precise square root +=========================== ====================================================== -The value of this flag is determined by the "nvvm-reflect-ftz" module flag. -The following sets the ftz flag to 1. +The value of these flags are determined by the "nvvm-reflect-ftz" and +"nvvm-reflect-prec-sqrt" module flags respectively. +The following sets the ftz flag to 1, and the precise sqrt flag to 1. .. code-block:: llvm - !llvm.module.flag = !{!0} + !llvm.module.flag = !{!0, !1} !0 = !{i32 4, !"nvvm-reflect-ftz", i32 1} + !1 = !{i32 4, !"nvvm-reflect-prec-sqrt", i32 1} (``i32 4`` indicates that the value set here overrides the value in another module we link with. See the `LangRef ` From 9f722e6a25583fb3f73ac344da8aeff161b8650d Mon Sep 17 00:00:00 2001 From: Nicolas Miller Date: Thu, 16 Dec 2021 14:39:17 +0000 Subject: [PATCH 6/6] [SYCL][CUDA] Add -fcuda-prec-sqrt flag test --- clang/test/CodeGenCUDA/nvvm-reflect-prec-sqrt.cu | 11 +++++++++++ 1 file changed, 11 insertions(+) create mode 100644 clang/test/CodeGenCUDA/nvvm-reflect-prec-sqrt.cu diff --git a/clang/test/CodeGenCUDA/nvvm-reflect-prec-sqrt.cu b/clang/test/CodeGenCUDA/nvvm-reflect-prec-sqrt.cu new file mode 100644 index 0000000000000..9054f1ab785ec --- /dev/null +++ b/clang/test/CodeGenCUDA/nvvm-reflect-prec-sqrt.cu @@ -0,0 +1,11 @@ +// RUN: %clang_cc1 -fcuda-is-device -triple nvptx64-nvidia-cuda -emit-llvm -fcuda-prec-sqrt %s -o -| FileCheck --check-prefix=CHECK-ON %s +// RUN: %clang_cc1 -fcuda-is-device -triple nvptx64-nvidia-cuda -emit-llvm %s -o -| FileCheck --check-prefix=CHECK-OFF %s + +#include "Inputs/cuda.h" + +// Check that the -fcuda-prec-sqrt flag correctly sets the nvvm-reflect module flags. + +extern "C" __device__ void foo() {} + +// CHECK-ON: !{i32 4, !"nvvm-reflect-prec-sqrt", i32 1} +// CHECK-OFF: !{i32 4, !"nvvm-reflect-prec-sqrt", i32 0}